// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file.  To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user-visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming.  Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location.  To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad.  This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
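	 * Offset and len are byte values, converted from the rmap record
	 * when the chunk is set up in xfs_zone_gc_start_chunk().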
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	struct xfs_rtgroup	*victim_rtg;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpad used, and index to indicate which one is used.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.  Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		    remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec.  It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
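	 * (Each GC chunk adds a single contiguous range of its scratch folio
	 * to the bio, so one bvec is always sufficient.)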
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)
{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
	atomic_inc(&victim_rtg->rtg_gccount);
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_rmap_irec	*reca = a;
	const struct xfs_rmap_irec	*recb = b;
	int				diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
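	 * Records for the same inode are then written out back to back, so
	 * their new blocks end up adjacent in the GC target zone.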
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	atomic_dec(&iter->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/*
		 * If the zone is already undergoing GC, don't pick it again.
		 *
		 * This prevents us from picking one of the zones for which we
		 * already submitted GC I/O, but for which the remapping hasn't
		 * concluded yet.  This won't cause data corruption, but
		 * increases write amplification and slows down GC, so this is
		 * a bad thing.
		 */
		if (atomic_read(&rtg->rtg_gccount)) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim.
		 * All of these zones are in the last bucket, so avoid the
		 * expensive division for the zones in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space, keep writing to it, else first wait
 * for all pending writes and then pick a new one.
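 *
 * The wait is handled by xfs_zone_gc_select_target(), which returns NULL
 * while the previous target still has writes in flight.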
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
		mp->m_free[XC_FREE_RTEXTENTS].res_avail,
		mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
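	/* Remember whether the new location requires zone append writes. */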
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;
	chunk->victim_rtg = iter->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	atomic_dec(&chunk->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(chunk->victim_rtg);
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	split_chunk->victim_rtg = chunk->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
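	/* Physical address of the GCed data inside the scratch folio. */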
	phys_addr_t		bvec_paddr =
		bvec_phys(bio_first_bvec_all(&chunk->bio));
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			offset_in_folio(chunk->scratch->folio, bvec_paddr));

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
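	/*
	 * xfs_zone_gc_prepare_reset() turns this into a discard for
	 * conventional zones and returns false if neither a zone reset nor a
	 * discard can be issued, in which case there is nothing to do.
	 */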
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data	*data)
{
	if (xfs_is_shutdown(data->mp))
		return false;
	if (!xfs_zone_gc_space_available(data))
		return false;

	if (!data->iter.victim_rtg) {
		if (kthread_should_stop() || kthread_should_park())
			return false;
		if (!xfs_zoned_need_gc(data->mp))
			return false;
		if (!xfs_zone_gc_select_victim(data))
			return false;
	}

	return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
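 * Each list is therefore drained front to back, and processing stops at the
 * first chunk whose I/O has not completed yet.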
 */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that, reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);

		xfs_zone_gc_handle_work(data);

		/*
		 * Only sleep if nothing set the state to running.  Else check
		 * for work again as someone might have queued up more work and
		 * woken us in the meantime.
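		 * (xfs_zone_gc_handle_work() sets the state back to
		 * TASK_RUNNING whenever it made progress, so falling through
		 * to schedule() only happens when all lists were idle.)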
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;
		}

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into.  This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}