// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file. To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio			*folio;
	unsigned int			offset;
	unsigned int			freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data		*data;

	/*
	 * Entry into the reading/writing/resetting list. Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head		entry;

	/*
	 * State of this gc_bio. Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
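	 * The inode reference taken in xfs_zone_gc_iter_next() is held
	 * until the chunk is freed in xfs_zone_gc_free_chunk().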
	 */
	struct xfs_inode		*ip;
	loff_t				offset;
	unsigned int			len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t			old_startblock;
	xfs_daddr_t			new_daddr;
	struct xfs_zone_scratch		*scratch;

	/* Are we writing to a sequential write required zone? */
	bool				is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone		*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec			bv;
	struct bio			bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS		1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup		*victim_rtg;
	unsigned int			rec_count;
	unsigned int			rec_idx;
	xfs_agblock_t			next_startblock;
	struct xfs_rmap_irec		*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount		*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set			bio_set;

	/*
	 * Scratchpad used, and index to indicate which one is used.
	 */
	struct xfs_zone_scratch		scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int			scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head		reading;
	struct list_head		writing;
	struct list_head		resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter		iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;
	if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;
	return false;
}

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec. It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)
{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

#define cmp_int(l, r)		((l > r) - (l < r))

static int
xfs_zone_gc_rmap_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int			diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
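 *
 * The buckets are scanned in order and the first one that yields a victim
 * terminates the search; within a bucket the zone with the fewest used
 * blocks is preferred so that as little data as possible has to be moved.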
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	if (xfs_is_shutdown(mp))
		return false;

	if (iter->victim_rtg)
		return true;

	/*
	 * Don't start new work if we are asked to stop or park.
	 */
	if (kthread_should_stop() || kthread_should_park())
		return false;

	if (!xfs_zoned_need_gc(mp))
		return false;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found ||
		    oz->oz_write_pointer < found->oz_write_pointer)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space, keep writing to it; else first wait
 * for all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
	oz->oz_write_pointer += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
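	/*
	 * For conventional zones new_daddr is the final write location.  For
	 * sequential write required zones the actual location is only known
	 * once the zone append completes, and new_daddr is updated from the
	 * returned sector in xfs_zone_gc_finish_chunk().
	 */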
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

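	/*
	 * Reuse the bio that read the data, and the scratch folio it was read
	 * into, to write the data out to the GC target zone.
	 */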
	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			folio_offset);

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

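	/*
	 * With the RT device cache flushed above, force the log for each
	 * zone's rmap inode below so that all transactions referencing the
	 * rmap are on disk before the reset (or discard for conventional
	 * zones) is issued.
	 */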
	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static bool
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (!xfs_zone_gc_select_victim(data) ||
	    !xfs_zone_gc_space_available(data)) {
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !reset_list)
			return false;
	}

	__set_current_state(TASK_RUNNING);
	try_to_freeze();

	if (reset_list)
		xfs_zone_gc_reset_zones(data, reset_list);

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	blk_start_plug(&plug);
	while (xfs_zone_gc_start_chunk(data))
		;
	blk_finish_plug(&plug);
	return true;
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that, reflink is
 * currently not supported on zoned file systems: a file system with both the
 * reflink and zoned features can neither be created nor mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);
		if (xfs_zone_gc_handle_work(data))
			continue;

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into. This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}