// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010, 2023 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"

/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meets the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we
 * are gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks kicking the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the
 * AG for each iteration to restart the INOBT lookup from.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do is record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps in-flight discard IO.
 */

#define XFS_DISCARD_MAX_EXAMINE	(100)

struct workqueue_struct *xfs_discard_wq;

static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(&extents->extent_list, false);
	kfree(extents->owner);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * eb_lock.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}

static inline struct block_device *
xfs_group_bdev(
	const struct xfs_group	*xg)
{
	struct xfs_mount	*mp = xg->xg_mount;

	switch (xg->xg_type) {
	case XG_TYPE_AG:
		return mp->m_ddev_targp->bt_bdev;
	case XG_TYPE_RTG:
		return mp->m_rtdev_targp->bt_bdev;
	default:
		ASSERT(0);
		break;
	}
	return NULL;
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
int
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		trace_xfs_discard_extent(busyp->group, busyp->bno,
				busyp->length);

		error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
				xfs_gbno_to_daddr(busyp->group, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
				"discard failed for extent [0x%llx,%u], error %d",
				(unsigned long long)busyp->bno,
				busyp->length,
				error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);

	return error;
}

/*
 * Care must be taken setting up the trim cursor as the perags may not have
 * been initialised when the cursor is initialised. e.g. a clean mount which
 * hasn't read in the AGFs and the first operation run on the mounted fs is a
 * trim. This can result in perag fields that aren't initialised until
 * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG
 * for the free space search.
 */
struct xfs_trim_cur {
	xfs_agblock_t	start;
	xfs_extlen_t	count;
	xfs_agblock_t	end;
	xfs_extlen_t	minlen;
	bool		by_bno;
};

static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	struct xfs_trim_cur	*tcur,
	struct xfs_busy_extents	*extents)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
	int			batch = XFS_DISCARD_MAX_EXAMINE;

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	/*
	 * First time through tcur->count will not have been initialised as
	 * pag->pagf_longest is not guaranteed to be valid before we read
	 * the AGF buffer above.
	 */
	if (!tcur->count)
		tcur->count = pag->pagf_longest;

	if (tcur->by_bno) {
		/* sub-AG discard request always starts at tcur->start */
		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
		if (!error && !i)
			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
	} else if (tcur->start == 0) {
		/* first time through a by-len starts with max length */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
	} else {
		/* nth time through a by-len starts where we left off */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
	}
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->count = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->start = fbno;
			tcur->count = flen;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to trim, skip it. Do not bother to trim down
		 * partially overlapping ranges for now.
		 */
		if (fbno + flen < tcur->start) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			goto next_extent;
		}
		if (fbno > tcur->end) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			if (tcur->by_bno) {
				tcur->count = 0;
				break;
			}
			goto next_extent;
		}

		/* Trim the extent returned to the range we want. */
		if (fbno < tcur->start) {
			flen -= tcur->start - fbno;
			fbno = tcur->start;
		}
		if (fbno + flen > tcur->end + 1)
			flen = tcur->end - fbno + 1;
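		/*
		 * For example, with a trim range of [100, 199] a free extent
		 * record of (90, 50), i.e. blocks 90..139, is clamped by the
		 * checks above to (100, 40) before the minimum length test
		 * below.
		 */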

		/* Too small?  Give up. */
		if (flen < tcur->minlen) {
			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
			if (tcur->by_bno)
				goto next_extent;
			tcur->count = 0;
			break;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
				&extents->extent_list);
next_extent:
		if (tcur->by_bno)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there are no more records in the tree, we are done. Set
		 * the cursor block count to 0 to indicate to the caller that
		 * there are no more extents to search.
		 */
		if (i == 0)
			tcur->count = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(&extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

static bool
xfs_trim_should_stop(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Iterate the free list gathering extents and discarding them. We need a
 * cursor for the repeated iteration of the gather/discard loop, so use the
 * longest extent we found in the last batch as the key to start the next.
 */
static int
xfs_trim_perag_extents(
	struct xfs_perag	*pag,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	xfs_extlen_t		minlen)
{
	struct xfs_trim_cur	tcur = {
		.start		= start,
		.end		= end,
		.minlen		= minlen,
	};
	int			error = 0;

	if (start != 0 || end != pag_group(pag)->xg_block_count)
		tcur.by_bno = true;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, &tcur, extents);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
404 */ 405 error = xfs_discard_extents(pag_mount(pag), extents); 406 if (error) 407 break; 408 409 if (xfs_trim_should_stop()) 410 break; 411 412 } while (tcur.count != 0); 413 414 return error; 415 416 } 417 418 static int 419 xfs_trim_datadev_extents( 420 struct xfs_mount *mp, 421 xfs_daddr_t start, 422 xfs_daddr_t end, 423 xfs_extlen_t minlen) 424 { 425 xfs_agnumber_t start_agno, end_agno; 426 xfs_agblock_t start_agbno, end_agbno; 427 struct xfs_perag *pag = NULL; 428 xfs_daddr_t ddev_end; 429 int last_error = 0, error; 430 431 ddev_end = min_t(xfs_daddr_t, end, 432 XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1); 433 434 start_agno = xfs_daddr_to_agno(mp, start); 435 start_agbno = xfs_daddr_to_agbno(mp, start); 436 end_agno = xfs_daddr_to_agno(mp, ddev_end); 437 end_agbno = xfs_daddr_to_agbno(mp, ddev_end); 438 439 while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) { 440 xfs_agblock_t agend = pag_group(pag)->xg_block_count; 441 442 if (pag_agno(pag) == end_agno) 443 agend = end_agbno; 444 error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen); 445 if (error) 446 last_error = error; 447 448 if (xfs_trim_should_stop()) { 449 xfs_perag_rele(pag); 450 break; 451 } 452 start_agbno = 0; 453 } 454 455 return last_error; 456 } 457 458 #ifdef CONFIG_XFS_RT 459 struct xfs_trim_rtdev { 460 /* list of rt extents to free */ 461 struct list_head extent_list; 462 463 /* minimum length that caller allows us to trim */ 464 xfs_rtblock_t minlen_fsb; 465 466 /* restart point for the rtbitmap walk */ 467 xfs_rtxnum_t restart_rtx; 468 469 /* stopping point for the current rtbitmap walk */ 470 xfs_rtxnum_t stop_rtx; 471 }; 472 473 struct xfs_rtx_busy { 474 struct list_head list; 475 xfs_rtblock_t bno; 476 xfs_rtblock_t length; 477 }; 478 479 static void 480 xfs_discard_free_rtdev_extents( 481 struct xfs_trim_rtdev *tr) 482 { 483 struct xfs_rtx_busy *busyp, *n; 484 485 list_for_each_entry_safe(busyp, n, &tr->extent_list, list) { 486 list_del_init(&busyp->list); 487 kfree(busyp); 488 } 489 } 490 491 /* 492 * Walk the discard list and issue discards on all the busy extents in the 493 * list. We plug and chain the bios so that we only need a single completion 494 * call to clear all the busy extents once the discards are complete. 
495 */ 496 static int 497 xfs_discard_rtdev_extents( 498 struct xfs_mount *mp, 499 struct xfs_trim_rtdev *tr) 500 { 501 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; 502 struct xfs_rtx_busy *busyp; 503 struct bio *bio = NULL; 504 struct blk_plug plug; 505 xfs_rtblock_t start = NULLRTBLOCK, length = 0; 506 int error = 0; 507 508 blk_start_plug(&plug); 509 list_for_each_entry(busyp, &tr->extent_list, list) { 510 if (start == NULLRTBLOCK) 511 start = busyp->bno; 512 length += busyp->length; 513 514 trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length); 515 516 error = __blkdev_issue_discard(bdev, 517 xfs_rtb_to_daddr(mp, busyp->bno), 518 XFS_FSB_TO_BB(mp, busyp->length), 519 GFP_NOFS, &bio); 520 if (error) 521 break; 522 } 523 xfs_discard_free_rtdev_extents(tr); 524 525 if (bio) { 526 error = submit_bio_wait(bio); 527 if (error == -EOPNOTSUPP) 528 error = 0; 529 if (error) 530 xfs_info(mp, 531 "discard failed for rtextent [0x%llx,%llu], error %d", 532 (unsigned long long)start, 533 (unsigned long long)length, 534 error); 535 bio_put(bio); 536 } 537 blk_finish_plug(&plug); 538 539 return error; 540 } 541 542 static int 543 xfs_trim_gather_rtextent( 544 struct xfs_rtgroup *rtg, 545 struct xfs_trans *tp, 546 const struct xfs_rtalloc_rec *rec, 547 void *priv) 548 { 549 struct xfs_trim_rtdev *tr = priv; 550 struct xfs_rtx_busy *busyp; 551 xfs_rtblock_t rbno, rlen; 552 553 if (rec->ar_startext > tr->stop_rtx) { 554 /* 555 * If we've scanned a large number of rtbitmap blocks, update 556 * the cursor to point at this extent so we restart the next 557 * batch from this extent. 558 */ 559 tr->restart_rtx = rec->ar_startext; 560 return -ECANCELED; 561 } 562 563 rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext); 564 rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount); 565 566 /* Ignore too small. */ 567 if (rlen < tr->minlen_fsb) { 568 trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen); 569 return 0; 570 } 571 572 busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL); 573 if (!busyp) 574 return -ENOMEM; 575 576 busyp->bno = rbno; 577 busyp->length = rlen; 578 INIT_LIST_HEAD(&busyp->list); 579 list_add_tail(&busyp->list, &tr->extent_list); 580 581 tr->restart_rtx = rec->ar_startext + rec->ar_extcount; 582 return 0; 583 } 584 585 /* Trim extents on an !rtgroups realtime device */ 586 static int 587 xfs_trim_rtextents( 588 struct xfs_rtgroup *rtg, 589 xfs_rtxnum_t low, 590 xfs_rtxnum_t high, 591 xfs_daddr_t minlen) 592 { 593 struct xfs_mount *mp = rtg_mount(rtg); 594 struct xfs_trim_rtdev tr = { 595 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), 596 .extent_list = LIST_HEAD_INIT(tr.extent_list), 597 }; 598 struct xfs_trans *tp; 599 int error; 600 601 error = xfs_trans_alloc_empty(mp, &tp); 602 if (error) 603 return error; 604 605 /* 606 * Walk the free ranges between low and high. The query_range function 607 * trims the extents returned. 
608 */ 609 do { 610 tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp); 611 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 612 error = xfs_rtalloc_query_range(rtg, tp, low, high, 613 xfs_trim_gather_rtextent, &tr); 614 615 if (error == -ECANCELED) 616 error = 0; 617 if (error) { 618 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 619 xfs_discard_free_rtdev_extents(&tr); 620 break; 621 } 622 623 if (list_empty(&tr.extent_list)) { 624 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 625 break; 626 } 627 628 error = xfs_discard_rtdev_extents(mp, &tr); 629 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 630 if (error) 631 break; 632 633 low = tr.restart_rtx; 634 } while (!xfs_trim_should_stop() && low <= high); 635 636 xfs_trans_cancel(tp); 637 return error; 638 } 639 640 struct xfs_trim_rtgroup { 641 /* list of rtgroup extents to free */ 642 struct xfs_busy_extents *extents; 643 644 /* minimum length that caller allows us to trim */ 645 xfs_rtblock_t minlen_fsb; 646 647 /* restart point for the rtbitmap walk */ 648 xfs_rtxnum_t restart_rtx; 649 650 /* number of extents to examine before stopping to issue discard ios */ 651 int batch; 652 653 /* number of extents queued for discard */ 654 int queued; 655 }; 656 657 static int 658 xfs_trim_gather_rtgroup_extent( 659 struct xfs_rtgroup *rtg, 660 struct xfs_trans *tp, 661 const struct xfs_rtalloc_rec *rec, 662 void *priv) 663 { 664 struct xfs_trim_rtgroup *tr = priv; 665 xfs_rgblock_t rgbno; 666 xfs_extlen_t len; 667 668 if (--tr->batch <= 0) { 669 /* 670 * If we've checked a large number of extents, update the 671 * cursor to point at this extent so we restart the next batch 672 * from this extent. 673 */ 674 tr->restart_rtx = rec->ar_startext; 675 return -ECANCELED; 676 } 677 678 rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext); 679 len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount); 680 681 /* Ignore too small. */ 682 if (len < tr->minlen_fsb) { 683 trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len); 684 return 0; 685 } 686 687 /* 688 * If any blocks in the range are still busy, skip the discard and try 689 * again the next time. 690 */ 691 if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) { 692 trace_xfs_discard_busy(rtg_group(rtg), rgbno, len); 693 return 0; 694 } 695 696 xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len, 697 &tr->extents->extent_list); 698 699 tr->queued++; 700 tr->restart_rtx = rec->ar_startext + rec->ar_extcount; 701 return 0; 702 } 703 704 /* Trim extents in this rtgroup using the busy extent machinery. */ 705 static int 706 xfs_trim_rtgroup_extents( 707 struct xfs_rtgroup *rtg, 708 xfs_rtxnum_t low, 709 xfs_rtxnum_t high, 710 xfs_daddr_t minlen) 711 { 712 struct xfs_mount *mp = rtg_mount(rtg); 713 struct xfs_trim_rtgroup tr = { 714 .minlen_fsb = XFS_BB_TO_FSB(mp, minlen), 715 }; 716 struct xfs_trans *tp; 717 int error; 718 719 error = xfs_trans_alloc_empty(mp, &tp); 720 if (error) 721 return error; 722 723 /* 724 * Walk the free ranges between low and high. The query_range function 725 * trims the extents returned. 
726 */ 727 do { 728 tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL); 729 if (!tr.extents) { 730 error = -ENOMEM; 731 break; 732 } 733 734 tr.queued = 0; 735 tr.batch = XFS_DISCARD_MAX_EXAMINE; 736 tr.extents->owner = tr.extents; 737 INIT_LIST_HEAD(&tr.extents->extent_list); 738 739 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 740 error = xfs_rtalloc_query_range(rtg, tp, low, high, 741 xfs_trim_gather_rtgroup_extent, &tr); 742 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED); 743 if (error == -ECANCELED) 744 error = 0; 745 if (error) { 746 kfree(tr.extents); 747 break; 748 } 749 750 if (!tr.queued) 751 break; 752 753 /* 754 * We hand the extent list to the discard function here so the 755 * discarded extents can be removed from the busy extent list. 756 * This allows the discards to run asynchronously with 757 * gathering the next round of extents to discard. 758 * 759 * However, we must ensure that we do not reference the extent 760 * list after this function call, as it may have been freed by 761 * the time control returns to us. 762 */ 763 error = xfs_discard_extents(rtg_mount(rtg), tr.extents); 764 if (error) 765 break; 766 767 low = tr.restart_rtx; 768 } while (!xfs_trim_should_stop() && low <= high); 769 770 xfs_trans_cancel(tp); 771 return error; 772 } 773 774 static int 775 xfs_trim_rtdev_extents( 776 struct xfs_mount *mp, 777 xfs_daddr_t start, 778 xfs_daddr_t end, 779 xfs_daddr_t minlen) 780 { 781 xfs_rtblock_t start_rtbno, end_rtbno; 782 xfs_rtxnum_t start_rtx, end_rtx; 783 xfs_rgnumber_t start_rgno, end_rgno; 784 xfs_daddr_t daddr_offset; 785 int last_error = 0, error; 786 struct xfs_rtgroup *rtg = NULL; 787 788 /* Shift the start and end downwards to match the rt device. */ 789 daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); 790 if (start > daddr_offset) 791 start -= daddr_offset; 792 else 793 start = 0; 794 start_rtbno = xfs_daddr_to_rtb(mp, start); 795 start_rtx = xfs_rtb_to_rtx(mp, start_rtbno); 796 start_rgno = xfs_rtb_to_rgno(mp, start_rtbno); 797 798 if (end <= daddr_offset) 799 return 0; 800 else 801 end -= daddr_offset; 802 end_rtbno = xfs_daddr_to_rtb(mp, end); 803 end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1); 804 end_rgno = xfs_rtb_to_rgno(mp, end_rtbno); 805 806 while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) { 807 xfs_rtxnum_t rtg_end = rtg->rtg_extents; 808 809 if (rtg_rgno(rtg) == end_rgno) 810 rtg_end = min(rtg_end, end_rtx); 811 812 if (xfs_has_rtgroups(mp)) 813 error = xfs_trim_rtgroup_extents(rtg, start_rtx, 814 rtg_end, minlen); 815 else 816 error = xfs_trim_rtextents(rtg, start_rtx, rtg_end, 817 minlen); 818 if (error) 819 last_error = error; 820 821 if (xfs_trim_should_stop()) { 822 xfs_rtgroup_rele(rtg); 823 break; 824 } 825 start_rtx = 0; 826 } 827 828 return last_error; 829 } 830 #else 831 # define xfs_trim_rtdev_extents(...) (-EOPNOTSUPP) 832 #endif /* CONFIG_XFS_RT */ 833 834 /* 835 * trim a range of the filesystem. 836 * 837 * Note: the parameters passed from userspace are byte ranges into the 838 * filesystem which does not match to the format we use for filesystem block 839 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format 840 * is a linear address range. Hence we need to use DADDR based conversions and 841 * comparisons for determining the correct offset and regions to trim. 842 * 843 * The realtime device is mapped into the FITRIM "address space" immediately 844 * after the data device. 
845 */ 846 int 847 xfs_ioc_trim( 848 struct xfs_mount *mp, 849 struct fstrim_range __user *urange) 850 { 851 unsigned int granularity = 852 bdev_discard_granularity(mp->m_ddev_targp->bt_bdev); 853 struct block_device *rt_bdev = NULL; 854 struct fstrim_range range; 855 xfs_daddr_t start, end; 856 xfs_extlen_t minlen; 857 xfs_rfsblock_t max_blocks; 858 int error, last_error = 0; 859 860 if (!capable(CAP_SYS_ADMIN)) 861 return -EPERM; 862 863 if (mp->m_rtdev_targp && !xfs_has_zoned(mp) && 864 bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) 865 rt_bdev = mp->m_rtdev_targp->bt_bdev; 866 if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) 867 return -EOPNOTSUPP; 868 869 if (rt_bdev) 870 granularity = max(granularity, 871 bdev_discard_granularity(rt_bdev)); 872 873 /* 874 * We haven't recovered the log, so we cannot use our bnobt-guided 875 * storage zapping commands. 876 */ 877 if (xfs_has_norecovery(mp)) 878 return -EROFS; 879 880 if (copy_from_user(&range, urange, sizeof(range))) 881 return -EFAULT; 882 883 range.minlen = max_t(u64, granularity, range.minlen); 884 minlen = XFS_B_TO_FSB(mp, range.minlen); 885 886 /* 887 * Truncating down the len isn't actually quite correct, but using 888 * BBTOB would mean we trivially get overflows for values 889 * of ULLONG_MAX or slightly lower. And ULLONG_MAX is the default 890 * used by the fstrim application. In the end it really doesn't 891 * matter as trimming blocks is an advisory interface. 892 */ 893 max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks; 894 if (range.start >= XFS_FSB_TO_B(mp, max_blocks) || 895 range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) || 896 range.len < mp->m_sb.sb_blocksize) 897 return -EINVAL; 898 899 start = BTOBB(range.start); 900 end = start + BTOBBT(range.len) - 1; 901 902 if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) { 903 error = xfs_trim_datadev_extents(mp, start, end, minlen); 904 if (error) 905 last_error = error; 906 } 907 908 if (rt_bdev && !xfs_trim_should_stop()) { 909 error = xfs_trim_rtdev_extents(mp, start, end, minlen); 910 if (error) 911 last_error = error; 912 } 913 914 if (last_error) 915 return last_error; 916 917 range.len = min_t(unsigned long long, range.len, 918 XFS_FSB_TO_B(mp, max_blocks) - range.start); 919 if (copy_to_user(urange, &range, sizeof(range))) 920 return -EFAULT; 921 return 0; 922 } 923