// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file.  To support that a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming.  Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location.  To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad.  This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio			*folio;
	unsigned int			offset;
	unsigned int			freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data		*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head		entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode		*ip;
	loff_t				offset;
	unsigned int			len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t			old_startblock;
	xfs_daddr_t			new_daddr;
	struct xfs_zone_scratch		*scratch;

	/* Are we writing to a sequential write required zone? */
	bool				is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone		*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec			bv;
	struct bio			bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS		1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpad used, and index to indicate which one is used.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.  Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
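
	/*
	 * Roughly: threshold == free * mp->m_zonegc_low_space / 100, computed
	 * from the quotient and remainder of free / 100.  E.g. with
	 * free == 1,000,050 blocks and m_zonegc_low_space == 25 this gives
	 * 10,000 * 25 + 50 * 0 == 250,000 blocks.
	 */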
	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
			remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec.  It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)

{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
	const void			*a,
	const void			*b)
{
	const struct xfs_rmap_irec	*reca = a;
	const struct xfs_rmap_irec	*recb = b;
	int				diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim.  All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}
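
/*
 * The zone that GC writes into is tracked in zi_open_gc_zone rather than on
 * the zi_open_zones list, so it does not count against the open zones
 * available for regular data placement; the XFS_OPEN_GC_ZONES reservation
 * out of m_max_open_zones covers it.
 */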
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}
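
/*
 * Allocate blocks for the next GC chunk from the GC target zone.
 *
 * For conventional zones the returned daddr points at the exact blocks just
 * allocated.  For sequential write required zones only the zone start is
 * returned here; the write is later issued as a zone append, so the actual
 * location is only known at I/O completion (see xfs_zone_gc_finish_chunk()).
 */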
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}
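
/*
 * Writes to sequential write required zones are issued as zone appends and
 * thus may have to be split to fit the max_zone_append_sectors limit of the
 * device.  Each split takes its own inode and open zone references so that
 * its completion can be handled like any other chunk, and it is queued right
 * before the remainder of the original chunk to keep the completion order
 * intact.
 */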
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	phys_addr_t		bvec_paddr =
		bvec_phys(bio_first_bvec_all(&chunk->bio));
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			offset_in_folio(chunk->scratch->folio, bvec_paddr));

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}
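
/*
 * Synchronously reset a single zone.  This bypasses the bio-driven state
 * machine used by the GC thread and is meant for callers that need the reset
 * to have completed before proceeding.
 */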
int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data	*data)
{
	if (xfs_is_shutdown(data->mp))
		return false;
	if (!xfs_zone_gc_space_available(data))
		return false;

	if (!data->iter.victim_rtg) {
		if (kthread_should_stop() || kthread_should_park())
			return false;
		if (!xfs_zoned_need_gc(data->mp))
			return false;
		if (!xfs_zone_gc_select_victim(data))
			return false;
	}

	return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);

		xfs_zone_gc_handle_work(data);

		/*
		 * Only sleep if nothing set the state to running.  Else check
		 * for work again as someone might have queued up more work
		 * and woken us in the meantime.
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;
		}

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into.  This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}