// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2010, 2023 Red Hat, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_trans.h"
#include "xfs_mount.h"
#include "xfs_btree.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_discard.h"
#include "xfs_error.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_health.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtgroup.h"

/*
 * Notes on an efficient, low latency fstrim algorithm
 *
 * We need to walk the filesystem free space and issue discards on the free
 * space that meets the search criteria (size and location). We cannot issue
 * discards on extents that might be in use, or are so recently in use they are
 * still marked as busy. To serialise against extent state changes whilst we
 * are gathering extents to trim, we must hold the AGF lock to lock out other
 * allocations and extent free operations that might change extent state.
 *
 * However, we cannot just hold the AGF for the entire AG free space walk whilst
 * we issue discards on each free space that is found. Storage devices can have
 * extremely slow discard implementations (e.g. ceph RBD) and so walking a
 * couple of million free extents and issuing synchronous discards on each
 * extent can take a *long* time. Whilst we are doing this walk, nothing else
 * can access the AGF, and we can stall transactions and hence the log whilst
 * modifications wait for the AGF lock to be released. This can lead to hung
 * tasks kicking the hung task timer and rebooting the system. This is bad.
 *
 * Hence we need to take a leaf from the bulkstat playbook. It takes the AGI
 * lock, gathers a range of inode cluster buffers that are allocated, drops the
 * AGI lock and then reads all the inode cluster buffers and processes them. It
 * loops doing this, using a cursor to keep track of where it is up to in the AG
 * for each iteration to restart the INOBT lookup from.
 *
 * We can't do this exactly with free space - once we drop the AGF lock, the
 * state of the free extent is out of our control and we cannot run a discard
 * safely on it in this situation. Unless, of course, we've marked the free
 * extent as busy and undergoing a discard operation whilst we held the AGF
 * locked.
 *
 * This is exactly how online discard works - free extents are marked busy when
 * they are freed, and once the extent free has been committed to the journal,
 * the busy extent record is marked as "undergoing discard" and the discard is
 * then issued on the free extent. Once the discard completes, the busy extent
 * record is removed and the extent is able to be allocated again.
 *
 * In the context of fstrim, if we find a free extent we need to discard, we
 * don't have to discard it immediately. All we need to do is record that free
 * extent as being busy and under discard, and all the allocation routines will
 * now avoid trying to allocate it. Hence if we mark the extent as busy under
 * the AGF lock, we can safely discard it without holding the AGF lock because
 * nothing will attempt to allocate that free space until the discard completes.
 *
 * This also allows us to issue discards asynchronously like we do with online
 * discard, and so for fast devices fstrim will run much faster as we can have
 * multiple discard operations in flight at once, as well as pipeline the free
 * extent search so that it overlaps the in-flight discard IO.
 */

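/*
 * Illustrative outline of the loop described above (a sketch only - the real
 * implementation is xfs_trim_perag_extents() and xfs_trim_gather_extents()
 * below):
 *
 *	do {
 *		lock the AGF;
 *		gather up to XFS_DISCARD_MAX_EXAMINE free extents, marking
 *			each one busy and under discard;
 *		unlock the AGF;
 *		issue asynchronous discards on the gathered extents;
 *	} while (extents remain && !xfs_trim_should_stop());
 */
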
#define XFS_DISCARD_MAX_EXAMINE	(100)

struct workqueue_struct *xfs_discard_wq;

static void
xfs_discard_endio_work(
	struct work_struct	*work)
{
	struct xfs_busy_extents	*extents =
		container_of(work, struct xfs_busy_extents, endio_work);

	xfs_extent_busy_clear(&extents->extent_list, false);
	kfree(extents->owner);
}

/*
 * Queue up the actual completion to a thread to avoid IRQ-safe locking for
 * pagb_lock.
 */
static void
xfs_discard_endio(
	struct bio		*bio)
{
	struct xfs_busy_extents	*extents = bio->bi_private;

	INIT_WORK(&extents->endio_work, xfs_discard_endio_work);
	queue_work(xfs_discard_wq, &extents->endio_work);
	bio_put(bio);
}

static inline struct block_device *
xfs_group_bdev(
	const struct xfs_group	*xg)
{
	struct xfs_mount	*mp = xg->xg_mount;

	switch (xg->xg_type) {
	case XG_TYPE_AG:
		return mp->m_ddev_targp->bt_bdev;
	case XG_TYPE_RTG:
		return mp->m_rtdev_targp->bt_bdev;
	default:
		ASSERT(0);
		break;
	}
	return NULL;
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
int
xfs_discard_extents(
	struct xfs_mount	*mp,
	struct xfs_busy_extents	*extents)
{
	struct xfs_extent_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &extents->extent_list, list) {
		trace_xfs_discard_extent(busyp->group, busyp->bno,
				busyp->length);

		error = __blkdev_issue_discard(xfs_group_bdev(busyp->group),
				xfs_gbno_to_daddr(busyp->group, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_KERNEL, &bio);
		if (error && error != -EOPNOTSUPP) {
			xfs_info(mp,
	 "discard failed for extent [0x%llx,%u], error %d",
				(unsigned long long)busyp->bno,
				busyp->length,
				error);
			break;
		}
	}

	if (bio) {
		bio->bi_private = extents;
		bio->bi_end_io = xfs_discard_endio;
		submit_bio(bio);
	} else {
		xfs_discard_endio_work(&extents->endio_work);
	}
	blk_finish_plug(&plug);

	return error;
}

struct xfs_trim_cur {
	xfs_agblock_t	start;
	xfs_extlen_t	count;
	xfs_agblock_t	end;
	xfs_extlen_t	minlen;
	bool		by_bno;
};

static int
xfs_trim_gather_extents(
	struct xfs_perag	*pag,
	struct xfs_trim_cur	*tcur,
	struct xfs_busy_extents	*extents)
{
	struct xfs_mount	*mp = pag_mount(pag);
	struct xfs_trans	*tp;
	struct xfs_btree_cur	*cur;
	struct xfs_buf		*agbp;
	int			error;
	int			i;
	int			batch = XFS_DISCARD_MAX_EXAMINE;

	/*
	 * Force out the log. This means any transactions that might have freed
	 * space before we take the AGF buffer lock are now on disk, and the
	 * volatile disk cache is flushed.
	 */
	xfs_log_force(mp, XFS_LOG_SYNC);

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
	if (error)
		goto out_trans_cancel;

	if (tcur->by_bno) {
		/* sub-AG discard request always starts at tcur->start */
		cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, 0, &i);
		if (!error && !i)
			error = xfs_alloc_lookup_ge(cur, tcur->start, 0, &i);
	} else if (tcur->start == 0) {
		/* first time through a by-len starts with max length */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_ge(cur, 0, tcur->count, &i);
	} else {
		/* nth time through a by-len starts where we left off */
		cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag);
		error = xfs_alloc_lookup_le(cur, tcur->start, tcur->count, &i);
	}
	if (error)
		goto out_del_cursor;
	if (i == 0) {
		/* nothing of that length left in the AG, we are done */
		tcur->count = 0;
		goto out_del_cursor;
	}

	/*
	 * Loop until we are done with all extents that are large
	 * enough to be worth discarding or we hit batch limits.
	 */
	while (i) {
		xfs_agblock_t	fbno;
		xfs_extlen_t	flen;

		error = xfs_alloc_get_rec(cur, &fbno, &flen, &i);
		if (error)
			break;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			break;
		}

		if (--batch <= 0) {
			/*
			 * Update the cursor to point at this extent so we
			 * restart the next batch from this extent.
			 */
			tcur->start = fbno;
			tcur->count = flen;
			break;
		}

		/*
		 * If the extent is entirely outside of the range we are
		 * supposed to trim, skip it. Do not bother to trim down
		 * partially overlapping ranges for now.
		 */
		if (fbno + flen < tcur->start) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			goto next_extent;
		}
		if (fbno > tcur->end) {
			trace_xfs_discard_exclude(pag_group(pag), fbno, flen);
			if (tcur->by_bno) {
				tcur->count = 0;
				break;
			}
			goto next_extent;
		}

		/* Trim the extent returned to the range we want. */
		if (fbno < tcur->start) {
			flen -= tcur->start - fbno;
			fbno = tcur->start;
		}
		if (fbno + flen > tcur->end + 1)
			flen = tcur->end - fbno + 1;

		/* Too small?  Give up. */
		if (flen < tcur->minlen) {
			trace_xfs_discard_toosmall(pag_group(pag), fbno, flen);
			if (tcur->by_bno)
				goto next_extent;
			tcur->count = 0;
			break;
		}

		/*
		 * If any blocks in the range are still busy, skip the
		 * discard and try again the next time.
		 */
		if (xfs_extent_busy_search(pag_group(pag), fbno, flen)) {
			trace_xfs_discard_busy(pag_group(pag), fbno, flen);
			goto next_extent;
		}

		xfs_extent_busy_insert_discard(pag_group(pag), fbno, flen,
				&extents->extent_list);
next_extent:
		if (tcur->by_bno)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			break;

		/*
		 * If there are no more records in the tree, we are done. Set
		 * the cursor block count to 0 to indicate to the caller that
		 * there are no more extents to search.
		 */
		if (i == 0)
			tcur->count = 0;
	}

	/*
	 * If there was an error, release all the gathered busy extents because
	 * we aren't going to issue a discard on them any more.
	 */
	if (error)
		xfs_extent_busy_clear(&extents->extent_list, false);
out_del_cursor:
	xfs_btree_del_cursor(cur, error);
out_trans_cancel:
	xfs_trans_cancel(tp);
	return error;
}

static bool
xfs_trim_should_stop(void)
{
	return fatal_signal_pending(current) || freezing(current);
}

/*
 * Iterate the free list gathering extents and discarding them. We need a
 * cursor for the repeated iteration of the gather/discard loop, so use the
 * longest extent we found in the last batch as the key to start the next.
 */
static int
xfs_trim_perag_extents(
	struct xfs_perag	*pag,
	xfs_agblock_t		start,
	xfs_agblock_t		end,
	xfs_extlen_t		minlen)
{
	struct xfs_trim_cur	tcur = {
		.start		= start,
		.count		= pag->pagf_longest,
		.end		= end,
		.minlen		= minlen,
	};
	int			error = 0;

	if (start != 0 || end != pag_group(pag)->xg_block_count)
		tcur.by_bno = true;

	do {
		struct xfs_busy_extents	*extents;

		extents = kzalloc(sizeof(*extents), GFP_KERNEL);
		if (!extents) {
			error = -ENOMEM;
			break;
		}

		extents->owner = extents;
		INIT_LIST_HEAD(&extents->extent_list);

		error = xfs_trim_gather_extents(pag, &tcur, extents);
		if (error) {
			kfree(extents);
			break;
		}

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with gathering
		 * the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(pag_mount(pag), extents);
		if (error)
			break;

		if (xfs_trim_should_stop())
			break;

	} while (tcur.count != 0);

	return error;
}

static int
xfs_trim_datadev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_extlen_t		minlen)
{
	xfs_agnumber_t		start_agno, end_agno;
	xfs_agblock_t		start_agbno, end_agbno;
	struct xfs_perag	*pag = NULL;
	xfs_daddr_t		ddev_end;
	int			last_error = 0, error;

	ddev_end = min_t(xfs_daddr_t, end,
			XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks) - 1);

	start_agno = xfs_daddr_to_agno(mp, start);
	start_agbno = xfs_daddr_to_agbno(mp, start);
	end_agno = xfs_daddr_to_agno(mp, ddev_end);
	end_agbno = xfs_daddr_to_agbno(mp, ddev_end);

	while ((pag = xfs_perag_next_range(mp, pag, start_agno, end_agno))) {
		xfs_agblock_t	agend = pag_group(pag)->xg_block_count;

		if (pag_agno(pag) == end_agno)
			agend = end_agbno;
		error = xfs_trim_perag_extents(pag, start_agbno, agend, minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_perag_rele(pag);
			break;
		}
		start_agbno = 0;
	}

	return last_error;
}

#ifdef CONFIG_XFS_RT
struct xfs_trim_rtdev {
	/* list of rt extents to free */
	struct list_head	extent_list;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* stopping point for the current rtbitmap walk */
	xfs_rtxnum_t		stop_rtx;
};

struct xfs_rtx_busy {
	struct list_head	list;
	xfs_rtblock_t		bno;
	xfs_rtblock_t		length;
};

static void
xfs_discard_free_rtdev_extents(
	struct xfs_trim_rtdev	*tr)
{
	struct xfs_rtx_busy	*busyp, *n;

	list_for_each_entry_safe(busyp, n, &tr->extent_list, list) {
		list_del_init(&busyp->list);
		kfree(busyp);
	}
}

/*
 * Walk the discard list and issue discards on all the busy extents in the
 * list. We plug and chain the bios so that we only need a single completion
 * call to clear all the busy extents once the discards are complete.
 */
static int
xfs_discard_rtdev_extents(
	struct xfs_mount	*mp,
	struct xfs_trim_rtdev	*tr)
{
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_rtx_busy	*busyp;
	struct bio		*bio = NULL;
	struct blk_plug		plug;
	xfs_rtblock_t		start = NULLRTBLOCK, length = 0;
	int			error = 0;

	blk_start_plug(&plug);
	list_for_each_entry(busyp, &tr->extent_list, list) {
		if (start == NULLRTBLOCK)
			start = busyp->bno;
		length += busyp->length;

		trace_xfs_discard_rtextent(mp, busyp->bno, busyp->length);

		error = __blkdev_issue_discard(bdev,
				xfs_rtb_to_daddr(mp, busyp->bno),
				XFS_FSB_TO_BB(mp, busyp->length),
				GFP_NOFS, &bio);
		if (error)
			break;
	}
	xfs_discard_free_rtdev_extents(tr);

	if (bio) {
		error = submit_bio_wait(bio);
		if (error == -EOPNOTSUPP)
			error = 0;
		if (error)
			xfs_info(mp,
	 "discard failed for rtextent [0x%llx,%llu], error %d",
				(unsigned long long)start,
				(unsigned long long)length,
				error);
		bio_put(bio);
	}
	blk_finish_plug(&plug);

	return error;
}

static int
xfs_trim_gather_rtextent(
	struct xfs_rtgroup	*rtg,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec *rec,
	void			*priv)
{
	struct xfs_trim_rtdev	*tr = priv;
	struct xfs_rtx_busy	*busyp;
	xfs_rtblock_t		rbno, rlen;

	if (rec->ar_startext > tr->stop_rtx) {
		/*
		 * If we've scanned a large number of rtbitmap blocks, update
		 * the cursor to point at this extent so we restart the next
		 * batch from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rbno = xfs_rtx_to_rtb(rtg, rec->ar_startext);
	rlen = xfs_rtbxlen_to_blen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (rlen < tr->minlen_fsb) {
		trace_xfs_discard_rttoosmall(rtg_mount(rtg), rbno, rlen);
		return 0;
	}

	busyp = kzalloc(sizeof(struct xfs_rtx_busy), GFP_KERNEL);
	if (!busyp)
		return -ENOMEM;

	busyp->bno = rbno;
	busyp->length = rlen;
	INIT_LIST_HEAD(&busyp->list);
	list_add_tail(&busyp->list, &tr->extent_list);

	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents on an !rtgroups realtime device */
static int
xfs_trim_rtextents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtdev	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
		.extent_list	= LIST_HEAD_INIT(tr.extent_list),
	};
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.stop_rtx = low + xfs_rtbitmap_rtx_per_rbmblock(mp);
		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtextent, &tr);

		if (error == -ECANCELED)
			error = 0;
		if (error) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			xfs_discard_free_rtdev_extents(&tr);
			break;
		}

		if (list_empty(&tr.extent_list)) {
			xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
			break;
		}

		error = xfs_discard_rtdev_extents(mp, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

struct xfs_trim_rtgroup {
	/* list of rtgroup extents to free */
	struct xfs_busy_extents	*extents;

	/* minimum length that caller allows us to trim */
	xfs_rtblock_t		minlen_fsb;

	/* restart point for the rtbitmap walk */
	xfs_rtxnum_t		restart_rtx;

	/* number of extents to examine before stopping to issue discard ios */
	int			batch;

	/* number of extents queued for discard */
	int			queued;
};

static int
xfs_trim_gather_rtgroup_extent(
	struct xfs_rtgroup	*rtg,
	struct xfs_trans	*tp,
	const struct xfs_rtalloc_rec *rec,
	void			*priv)
{
	struct xfs_trim_rtgroup	*tr = priv;
	xfs_rgblock_t		rgbno;
	xfs_extlen_t		len;

	if (--tr->batch <= 0) {
		/*
		 * If we've checked a large number of extents, update the
		 * cursor to point at this extent so we restart the next batch
		 * from this extent.
		 */
		tr->restart_rtx = rec->ar_startext;
		return -ECANCELED;
	}

	rgbno = xfs_rtx_to_rgbno(rtg, rec->ar_startext);
	len = xfs_rtxlen_to_extlen(rtg_mount(rtg), rec->ar_extcount);

	/* Ignore too small. */
	if (len < tr->minlen_fsb) {
		trace_xfs_discard_toosmall(rtg_group(rtg), rgbno, len);
		return 0;
	}

	/*
	 * If any blocks in the range are still busy, skip the discard and try
	 * again the next time.
	 */
	if (xfs_extent_busy_search(rtg_group(rtg), rgbno, len)) {
		trace_xfs_discard_busy(rtg_group(rtg), rgbno, len);
		return 0;
	}

	xfs_extent_busy_insert_discard(rtg_group(rtg), rgbno, len,
			&tr->extents->extent_list);

	tr->queued++;
	tr->restart_rtx = rec->ar_startext + rec->ar_extcount;
	return 0;
}

/* Trim extents in this rtgroup using the busy extent machinery. */
static int
xfs_trim_rtgroup_extents(
	struct xfs_rtgroup	*rtg,
	xfs_rtxnum_t		low,
	xfs_rtxnum_t		high,
	xfs_daddr_t		minlen)
{
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_trim_rtgroup	tr = {
		.minlen_fsb	= XFS_BB_TO_FSB(mp, minlen),
	};
	struct xfs_trans	*tp;
	int			error;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	/*
	 * Walk the free ranges between low and high. The query_range function
	 * trims the extents returned.
	 */
	do {
		tr.extents = kzalloc(sizeof(*tr.extents), GFP_KERNEL);
		if (!tr.extents) {
			error = -ENOMEM;
			break;
		}

		tr.queued = 0;
		tr.batch = XFS_DISCARD_MAX_EXAMINE;
		tr.extents->owner = tr.extents;
		INIT_LIST_HEAD(&tr.extents->extent_list);

		xfs_rtgroup_lock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		error = xfs_rtalloc_query_range(rtg, tp, low, high,
				xfs_trim_gather_rtgroup_extent, &tr);
		xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_BITMAP_SHARED);
		if (error == -ECANCELED)
			error = 0;
		if (error) {
			kfree(tr.extents);
			break;
		}

		if (!tr.queued)
			break;

		/*
		 * We hand the extent list to the discard function here so the
		 * discarded extents can be removed from the busy extent list.
		 * This allows the discards to run asynchronously with
		 * gathering the next round of extents to discard.
		 *
		 * However, we must ensure that we do not reference the extent
		 * list after this function call, as it may have been freed by
		 * the time control returns to us.
		 */
		error = xfs_discard_extents(rtg_mount(rtg), tr.extents);
		if (error)
			break;

		low = tr.restart_rtx;
	} while (!xfs_trim_should_stop() && low <= high);

	xfs_trans_cancel(tp);
	return error;
}

static int
xfs_trim_rtdev_extents(
	struct xfs_mount	*mp,
	xfs_daddr_t		start,
	xfs_daddr_t		end,
	xfs_daddr_t		minlen)
{
	xfs_rtblock_t		start_rtbno, end_rtbno;
	xfs_rtxnum_t		start_rtx, end_rtx;
	xfs_rgnumber_t		start_rgno, end_rgno;
	xfs_daddr_t		daddr_offset;
	int			last_error = 0, error;
	struct xfs_rtgroup	*rtg = NULL;

	/* Shift the start and end downwards to match the rt device. */
	daddr_offset = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks);
	if (start > daddr_offset)
		start -= daddr_offset;
	else
		start = 0;
	start_rtbno = xfs_daddr_to_rtb(mp, start);
	start_rtx = xfs_rtb_to_rtx(mp, start_rtbno);
	start_rgno = xfs_rtb_to_rgno(mp, start_rtbno);

	if (end <= daddr_offset)
		return 0;
	else
		end -= daddr_offset;
	end_rtbno = xfs_daddr_to_rtb(mp, end);
	end_rtx = xfs_rtb_to_rtx(mp, end_rtbno + mp->m_sb.sb_rextsize - 1);
	end_rgno = xfs_rtb_to_rgno(mp, end_rtbno);

	while ((rtg = xfs_rtgroup_next_range(mp, rtg, start_rgno, end_rgno))) {
		xfs_rtxnum_t	rtg_end = rtg->rtg_extents;

		if (rtg_rgno(rtg) == end_rgno)
			rtg_end = min(rtg_end, end_rtx);

		if (xfs_has_rtgroups(mp))
			error = xfs_trim_rtgroup_extents(rtg, start_rtx,
					rtg_end, minlen);
		else
			error = xfs_trim_rtextents(rtg, start_rtx, rtg_end,
					minlen);
		if (error)
			last_error = error;

		if (xfs_trim_should_stop()) {
			xfs_rtgroup_rele(rtg);
			break;
		}
		start_rtx = 0;
	}

	return last_error;
}
#else
# define xfs_trim_rtdev_extents(...)	(-EOPNOTSUPP)
#endif /* CONFIG_XFS_RT */

/*
 * Trim a range of the filesystem.
 *
 * Note: the parameters passed from userspace are byte ranges into the
 * filesystem, which does not match the format we use for filesystem block
 * addressing. FSB addressing is sparse (AGNO|AGBNO), while the incoming format
 * is a linear address range. Hence we need to use DADDR based conversions and
 * comparisons for determining the correct offset and regions to trim.
 *
 * The realtime device is mapped into the FITRIM "address space" immediately
 * after the data device.
 */
int
xfs_ioc_trim(
	struct xfs_mount		*mp,
	struct fstrim_range __user	*urange)
{
	unsigned int		granularity =
		bdev_discard_granularity(mp->m_ddev_targp->bt_bdev);
	struct block_device	*rt_bdev = NULL;
	struct fstrim_range	range;
	xfs_daddr_t		start, end;
	xfs_extlen_t		minlen;
	xfs_rfsblock_t		max_blocks;
	int			error, last_error = 0;

	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;
	if (mp->m_rtdev_targp &&
	    bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev))
		rt_bdev = mp->m_rtdev_targp->bt_bdev;
	if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev)
		return -EOPNOTSUPP;

	if (rt_bdev)
		granularity = max(granularity,
				  bdev_discard_granularity(rt_bdev));

	/*
	 * We haven't recovered the log, so we cannot use our bnobt-guided
	 * storage zapping commands.
	 */
	if (xfs_has_norecovery(mp))
		return -EROFS;

	if (copy_from_user(&range, urange, sizeof(range)))
		return -EFAULT;

	range.minlen = max_t(u64, granularity, range.minlen);
	minlen = XFS_B_TO_FSB(mp, range.minlen);

	/*
	 * Truncating down the len isn't actually quite correct, but using
	 * BBTOB would mean we trivially get overflows for values
	 * of ULLONG_MAX or slightly lower.  And ULLONG_MAX is the default
	 * used by the fstrim application.  In the end it really doesn't
	 * matter as trimming blocks is an advisory interface.
	 */
	max_blocks = mp->m_sb.sb_dblocks + mp->m_sb.sb_rblocks;
	if (range.start >= XFS_FSB_TO_B(mp, max_blocks) ||
	    range.minlen > XFS_FSB_TO_B(mp, mp->m_ag_max_usable) ||
	    range.len < mp->m_sb.sb_blocksize)
		return -EINVAL;

	start = BTOBB(range.start);
	end = start + BTOBBT(range.len) - 1;

	if (bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev)) {
		error = xfs_trim_datadev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (rt_bdev && !xfs_trim_should_stop()) {
		error = xfs_trim_rtdev_extents(mp, start, end, minlen);
		if (error)
			last_error = error;
	}

	if (last_error)
		return last_error;

	range.len = min_t(unsigned long long, range.len,
			  XFS_FSB_TO_B(mp, max_blocks) - range.start);
	if (copy_to_user(urange, &range, sizeof(range)))
		return -EFAULT;
	return 0;
}
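
/*
 * Example (userspace, illustrative only): the byte-based interface described
 * above is what fstrim(8) drives. A minimal sketch, assuming an XFS
 * filesystem mounted at /mnt (the mount point is hypothetical); FITRIM and
 * struct fstrim_range come from <linux/fs.h>, and ULLONG_MAX is the same
 * whole-device default length that fstrim uses:
 *
 *	#include <fcntl.h>
 *	#include <limits.h>
 *	#include <stdio.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	int main(void)
 *	{
 *		struct fstrim_range range = {
 *			.start = 0,
 *			.len = ULLONG_MAX,	// clamped to the fs size above
 *			.minlen = 0,		// raised to discard granularity
 *		};
 *		int fd = open("/mnt", O_RDONLY);
 *
 *		if (fd < 0 || ioctl(fd, FITRIM, &range) < 0) {
 *			perror("FITRIM");
 *			return 1;
 *		}
 *		// range.len is written back by xfs_ioc_trim() above
 *		printf("FITRIM covered %llu bytes\n",
 *				(unsigned long long)range.len);
 *		return 0;
 *	}
 */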