1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2023-2025 Christoph Hellwig. 4 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_shared.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_inode.h" 13 #include "xfs_btree.h" 14 #include "xfs_trans.h" 15 #include "xfs_icache.h" 16 #include "xfs_rmap.h" 17 #include "xfs_rtbitmap.h" 18 #include "xfs_rtrmap_btree.h" 19 #include "xfs_errortag.h" 20 #include "xfs_error.h" 21 #include "xfs_zone_alloc.h" 22 #include "xfs_zone_priv.h" 23 #include "xfs_zones.h" 24 #include "xfs_trace.h" 25 26 /* 27 * Implement Garbage Collection (GC) of partially used zoned. 28 * 29 * To support the purely sequential writes in each zone, zoned XFS needs to be 30 * able to move data remaining in a zone out of it to reset the zone to prepare 31 * for writing to it again. 32 * 33 * This is done by the GC thread implemented in this file. To support that a 34 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to 35 * write the garbage collected data into. 36 * 37 * Whenever the available space is below the chosen threshold, the GC thread 38 * looks for potential non-empty but not fully used zones that are worth 39 * reclaiming. Once found the rmap for the victim zone is queried, and after 40 * a bit of sorting to reduce fragmentation, the still live extents are read 41 * into memory and written to the GC target zone, and the bmap btree of the 42 * files is updated to point to the new location. To avoid taking the IOLOCK 43 * and MMAPLOCK for the entire GC process and thus affecting the latency of 44 * user reads and writes to the files, the GC writes are speculative and the 45 * I/O completion checks that no other writes happened for the affected regions 46 * before remapping. 47 * 48 * Once a zone does not contain any valid data, be that through GC or user 49 * block removal, it is queued for for a zone reset. The reset operation 50 * carefully ensures that the RT device cache is flushed and all transactions 51 * referencing the rmap have been committed to disk. 52 */ 53 54 /* 55 * Size of each GC scratch allocation, and the number of buffers. 56 */ 57 #define XFS_GC_BUF_SIZE SZ_1M 58 #define XFS_GC_NR_BUFS 2 59 static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS); 60 61 /* 62 * Chunk that is read and written for each GC operation. 63 * 64 * Note that for writes to actual zoned devices, the chunk can be split when 65 * reaching the hardware limit. 66 */ 67 struct xfs_gc_bio { 68 struct xfs_zone_gc_data *data; 69 70 /* 71 * Entry into the reading/writing/resetting list. Only accessed from 72 * the GC thread, so no locking needed. 73 */ 74 struct list_head entry; 75 76 /* 77 * State of this gc_bio. Done means the current I/O completed. 78 * Set from the bio end I/O handler, read from the GC thread. 79 */ 80 enum { 81 XFS_GC_BIO_NEW, 82 XFS_GC_BIO_DONE, 83 } state; 84 85 /* 86 * Pointer to the inode and byte range in the inode that this 87 * GC chunk is operating on. 88 */ 89 struct xfs_inode *ip; 90 loff_t offset; 91 unsigned int len; 92 93 /* 94 * Existing startblock (in the zone to be freed) and newly assigned 95 * daddr in the zone GCed into. 96 */ 97 xfs_fsblock_t old_startblock; 98 xfs_daddr_t new_daddr; 99 100 /* Are we writing to a sequential write required zone? */ 101 bool is_seq; 102 103 /* Open Zone being written to */ 104 struct xfs_open_zone *oz; 105 106 struct xfs_rtgroup *victim_rtg; 107 108 /* Bio used for reads and writes, including the bvec used by it */ 109 struct bio bio; /* must be last */ 110 }; 111 112 #define XFS_ZONE_GC_RECS 1024 113 114 /* iterator, needs to be reinitialized for each victim zone */ 115 struct xfs_zone_gc_iter { 116 struct xfs_rtgroup *victim_rtg; 117 unsigned int rec_count; 118 unsigned int rec_idx; 119 xfs_agblock_t next_startblock; 120 struct xfs_rmap_irec *recs; 121 }; 122 123 /* 124 * Per-mount GC state. 125 */ 126 struct xfs_zone_gc_data { 127 struct xfs_mount *mp; 128 129 /* bioset used to allocate the gc_bios */ 130 struct bio_set bio_set; 131 132 /* 133 * Scratchpad to buffer GC data, organized as a ring buffer over 134 * discontiguous folios. scratch_head is where the buffer is filled, 135 * scratch_tail tracks the buffer space freed, and scratch_available 136 * counts the space available in the ring buffer between the head and 137 * the tail. 138 */ 139 struct folio *scratch_folios[XFS_GC_NR_BUFS]; 140 unsigned int scratch_size; 141 unsigned int scratch_available; 142 unsigned int scratch_head; 143 unsigned int scratch_tail; 144 145 /* 146 * List of bios currently being read, written and reset. 147 * These lists are only accessed by the GC thread itself, and must only 148 * be processed in order. 149 */ 150 struct list_head reading; 151 struct list_head writing; 152 struct list_head resetting; 153 154 /* 155 * Iterator for the victim zone. 156 */ 157 struct xfs_zone_gc_iter iter; 158 }; 159 160 /* 161 * We aim to keep enough zones free in stock to fully use the open zone limit 162 * for data placement purposes. Additionally, the m_zonegc_low_space tunable 163 * can be set to make sure a fraction of the unused blocks are available for 164 * writing. 165 */ 166 bool 167 xfs_zoned_need_gc( 168 struct xfs_mount *mp) 169 { 170 s64 available, free, threshold; 171 s32 remainder; 172 173 if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) 174 return false; 175 176 available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); 177 178 if (available < 179 xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) 180 return true; 181 182 free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); 183 184 threshold = div_s64_rem(free, 100, &remainder); 185 threshold = threshold * mp->m_zonegc_low_space + 186 remainder * div_s64(mp->m_zonegc_low_space, 100); 187 188 if (available < threshold) 189 return true; 190 191 return false; 192 } 193 194 static struct xfs_zone_gc_data * 195 xfs_zone_gc_data_alloc( 196 struct xfs_mount *mp) 197 { 198 struct xfs_zone_gc_data *data; 199 int i; 200 201 data = kzalloc_obj(*data); 202 if (!data) 203 return NULL; 204 data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS); 205 if (!data->iter.recs) 206 goto out_free_data; 207 208 if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), 209 BIOSET_NEED_BVECS)) 210 goto out_free_recs; 211 for (i = 0; i < XFS_GC_NR_BUFS; i++) { 212 data->scratch_folios[i] = 213 folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE)); 214 if (!data->scratch_folios[i]) 215 goto out_free_scratch; 216 } 217 data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS; 218 data->scratch_available = data->scratch_size; 219 INIT_LIST_HEAD(&data->reading); 220 INIT_LIST_HEAD(&data->writing); 221 INIT_LIST_HEAD(&data->resetting); 222 data->mp = mp; 223 return data; 224 225 out_free_scratch: 226 while (--i >= 0) 227 folio_put(data->scratch_folios[i]); 228 bioset_exit(&data->bio_set); 229 out_free_recs: 230 kfree(data->iter.recs); 231 out_free_data: 232 kfree(data); 233 return NULL; 234 } 235 236 static void 237 xfs_zone_gc_data_free( 238 struct xfs_zone_gc_data *data) 239 { 240 int i; 241 242 for (i = 0; i < XFS_GC_NR_BUFS; i++) 243 folio_put(data->scratch_folios[i]); 244 bioset_exit(&data->bio_set); 245 kfree(data->iter.recs); 246 kfree(data); 247 } 248 249 static void 250 xfs_zone_gc_iter_init( 251 struct xfs_zone_gc_iter *iter, 252 struct xfs_rtgroup *victim_rtg) 253 254 { 255 iter->next_startblock = 0; 256 iter->rec_count = 0; 257 iter->rec_idx = 0; 258 iter->victim_rtg = victim_rtg; 259 atomic_inc(&victim_rtg->rtg_gccount); 260 } 261 262 /* 263 * Query the rmap of the victim zone to gather the records to evacuate. 264 */ 265 static int 266 xfs_zone_gc_query_cb( 267 struct xfs_btree_cur *cur, 268 const struct xfs_rmap_irec *irec, 269 void *private) 270 { 271 struct xfs_zone_gc_iter *iter = private; 272 273 ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); 274 ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); 275 ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); 276 277 iter->recs[iter->rec_count] = *irec; 278 if (++iter->rec_count == XFS_ZONE_GC_RECS) { 279 iter->next_startblock = 280 irec->rm_startblock + irec->rm_blockcount; 281 return 1; 282 } 283 return 0; 284 } 285 286 static int 287 xfs_zone_gc_rmap_rec_cmp( 288 const void *a, 289 const void *b) 290 { 291 const struct xfs_rmap_irec *reca = a; 292 const struct xfs_rmap_irec *recb = b; 293 int diff; 294 295 diff = cmp_int(reca->rm_owner, recb->rm_owner); 296 if (diff) 297 return diff; 298 return cmp_int(reca->rm_offset, recb->rm_offset); 299 } 300 301 static int 302 xfs_zone_gc_query( 303 struct xfs_mount *mp, 304 struct xfs_zone_gc_iter *iter) 305 { 306 struct xfs_rtgroup *rtg = iter->victim_rtg; 307 struct xfs_rmap_irec ri_low = { }; 308 struct xfs_rmap_irec ri_high; 309 struct xfs_btree_cur *cur; 310 struct xfs_trans *tp; 311 int error; 312 313 ASSERT(iter->next_startblock <= rtg_blocks(rtg)); 314 if (iter->next_startblock == rtg_blocks(rtg)) 315 goto done; 316 317 ASSERT(iter->next_startblock < rtg_blocks(rtg)); 318 ri_low.rm_startblock = iter->next_startblock; 319 memset(&ri_high, 0xFF, sizeof(ri_high)); 320 321 iter->rec_idx = 0; 322 iter->rec_count = 0; 323 324 tp = xfs_trans_alloc_empty(mp); 325 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 326 cur = xfs_rtrmapbt_init_cursor(tp, rtg); 327 error = xfs_rmap_query_range(cur, &ri_low, &ri_high, 328 xfs_zone_gc_query_cb, iter); 329 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 330 xfs_btree_del_cursor(cur, error < 0 ? error : 0); 331 xfs_trans_cancel(tp); 332 333 if (error < 0) 334 return error; 335 336 /* 337 * Sort the rmap records by inode number and increasing offset to 338 * defragment the mappings. 339 * 340 * This could be further enhanced by an even bigger look ahead window, 341 * but that's better left until we have better detection of changes to 342 * inode mapping to avoid the potential of GCing already dead data. 343 */ 344 sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), 345 xfs_zone_gc_rmap_rec_cmp, NULL); 346 347 if (error == 0) { 348 /* 349 * We finished iterating through the zone. 350 */ 351 iter->next_startblock = rtg_blocks(rtg); 352 if (iter->rec_count == 0) 353 goto done; 354 } 355 356 return 0; 357 done: 358 atomic_dec(&iter->victim_rtg->rtg_gccount); 359 xfs_rtgroup_rele(iter->victim_rtg); 360 iter->victim_rtg = NULL; 361 return 0; 362 } 363 364 static bool 365 xfs_zone_gc_iter_next( 366 struct xfs_mount *mp, 367 struct xfs_zone_gc_iter *iter, 368 struct xfs_rmap_irec *chunk_rec, 369 struct xfs_inode **ipp) 370 { 371 struct xfs_rmap_irec *irec; 372 int error; 373 374 if (!iter->victim_rtg) 375 return false; 376 377 retry: 378 if (iter->rec_idx == iter->rec_count) { 379 error = xfs_zone_gc_query(mp, iter); 380 if (error) 381 goto fail; 382 if (!iter->victim_rtg) 383 return false; 384 } 385 386 irec = &iter->recs[iter->rec_idx]; 387 error = xfs_iget(mp, NULL, irec->rm_owner, 388 XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); 389 if (error) { 390 /* 391 * If the inode was already deleted, skip over it. 392 */ 393 if (error == -ENOENT) { 394 iter->rec_idx++; 395 goto retry; 396 } 397 goto fail; 398 } 399 400 if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { 401 iter->rec_idx++; 402 xfs_irele(*ipp); 403 goto retry; 404 } 405 406 *chunk_rec = *irec; 407 return true; 408 409 fail: 410 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 411 return false; 412 } 413 414 static void 415 xfs_zone_gc_iter_advance( 416 struct xfs_zone_gc_iter *iter, 417 xfs_extlen_t count_fsb) 418 { 419 struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; 420 421 irec->rm_offset += count_fsb; 422 irec->rm_startblock += count_fsb; 423 irec->rm_blockcount -= count_fsb; 424 if (!irec->rm_blockcount) 425 iter->rec_idx++; 426 } 427 428 static struct xfs_rtgroup * 429 xfs_zone_gc_pick_victim_from( 430 struct xfs_mount *mp, 431 uint32_t bucket) 432 { 433 struct xfs_zone_info *zi = mp->m_zone_info; 434 uint32_t victim_used = U32_MAX; 435 struct xfs_rtgroup *victim_rtg = NULL; 436 uint32_t bit; 437 438 if (!zi->zi_used_bucket_entries[bucket]) 439 return NULL; 440 441 for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], 442 mp->m_sb.sb_rgcount) { 443 struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); 444 445 if (!rtg) 446 continue; 447 448 /* 449 * If the zone is already undergoing GC, don't pick it again. 450 * 451 * This prevents us from picking one of the zones for which we 452 * already submitted GC I/O, but for which the remapping hasn't 453 * concluded yet. This won't cause data corruption, but 454 * increases write amplification and slows down GC, so this is 455 * a bad thing. 456 */ 457 if (atomic_read(&rtg->rtg_gccount)) { 458 xfs_rtgroup_rele(rtg); 459 continue; 460 } 461 462 /* skip zones that are just waiting for a reset */ 463 if (rtg_rmap(rtg)->i_used_blocks == 0 || 464 rtg_rmap(rtg)->i_used_blocks >= victim_used) { 465 xfs_rtgroup_rele(rtg); 466 continue; 467 } 468 469 if (victim_rtg) 470 xfs_rtgroup_rele(victim_rtg); 471 victim_rtg = rtg; 472 victim_used = rtg_rmap(rtg)->i_used_blocks; 473 474 /* 475 * Any zone that is less than 1 percent used is fair game for 476 * instant reclaim. All of these zones are in the last 477 * bucket, so avoid the expensive division for the zones 478 * in the other buckets. 479 */ 480 if (bucket == 0 && 481 rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) 482 break; 483 } 484 485 return victim_rtg; 486 } 487 488 /* 489 * Iterate through all zones marked as reclaimable and find a candidate to 490 * reclaim. 491 */ 492 static bool 493 xfs_zone_gc_select_victim( 494 struct xfs_zone_gc_data *data) 495 { 496 struct xfs_zone_gc_iter *iter = &data->iter; 497 struct xfs_mount *mp = data->mp; 498 struct xfs_zone_info *zi = mp->m_zone_info; 499 struct xfs_rtgroup *victim_rtg = NULL; 500 unsigned int bucket; 501 502 spin_lock(&zi->zi_used_buckets_lock); 503 for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { 504 victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); 505 if (victim_rtg) 506 break; 507 } 508 spin_unlock(&zi->zi_used_buckets_lock); 509 510 if (!victim_rtg) 511 return false; 512 513 trace_xfs_zone_gc_select_victim(victim_rtg, bucket); 514 xfs_zone_gc_iter_init(iter, victim_rtg); 515 return true; 516 } 517 518 static struct xfs_open_zone * 519 xfs_zone_gc_steal_open( 520 struct xfs_zone_info *zi) 521 { 522 struct xfs_open_zone *oz, *found = NULL; 523 524 spin_lock(&zi->zi_open_zones_lock); 525 list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { 526 if (!found || oz->oz_allocated < found->oz_allocated) 527 found = oz; 528 } 529 530 if (found) { 531 found->oz_is_gc = true; 532 list_del_init(&found->oz_entry); 533 zi->zi_nr_open_zones--; 534 } 535 536 spin_unlock(&zi->zi_open_zones_lock); 537 return found; 538 } 539 540 static struct xfs_open_zone * 541 xfs_zone_gc_select_target( 542 struct xfs_mount *mp) 543 { 544 struct xfs_zone_info *zi = mp->m_zone_info; 545 struct xfs_open_zone *oz = zi->zi_open_gc_zone; 546 547 /* 548 * We need to wait for pending writes to finish. 549 */ 550 if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) 551 return NULL; 552 553 ASSERT(zi->zi_nr_open_zones <= 554 mp->m_max_open_zones - XFS_OPEN_GC_ZONES); 555 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); 556 if (oz) 557 trace_xfs_zone_gc_target_opened(oz->oz_rtg); 558 spin_lock(&zi->zi_open_zones_lock); 559 zi->zi_open_gc_zone = oz; 560 spin_unlock(&zi->zi_open_zones_lock); 561 return oz; 562 } 563 564 /* 565 * Ensure we have a valid open zone to write the GC data to. 566 * 567 * If the current target zone has space keep writing to it, else first wait for 568 * all pending writes and then pick a new one. 569 */ 570 static struct xfs_open_zone * 571 xfs_zone_gc_ensure_target( 572 struct xfs_mount *mp) 573 { 574 struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; 575 576 if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) 577 return xfs_zone_gc_select_target(mp); 578 return oz; 579 } 580 581 static void 582 xfs_zone_gc_end_io( 583 struct bio *bio) 584 { 585 struct xfs_gc_bio *chunk = 586 container_of(bio, struct xfs_gc_bio, bio); 587 struct xfs_zone_gc_data *data = chunk->data; 588 589 WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); 590 wake_up_process(data->mp->m_zone_info->zi_gc_thread); 591 } 592 593 static struct xfs_open_zone * 594 xfs_zone_gc_alloc_blocks( 595 struct xfs_zone_gc_data *data, 596 xfs_extlen_t *count_fsb, 597 xfs_daddr_t *daddr, 598 bool *is_seq) 599 { 600 struct xfs_mount *mp = data->mp; 601 struct xfs_open_zone *oz; 602 603 oz = xfs_zone_gc_ensure_target(mp); 604 if (!oz) 605 return NULL; 606 607 *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available)); 608 609 /* 610 * Directly allocate GC blocks from the reserved pool. 611 * 612 * If we'd take them from the normal pool we could be stealing blocks 613 * from a regular writer, which would then have to wait for GC and 614 * deadlock. 615 */ 616 spin_lock(&mp->m_sb_lock); 617 *count_fsb = min(*count_fsb, 618 rtg_blocks(oz->oz_rtg) - oz->oz_allocated); 619 *count_fsb = min3(*count_fsb, 620 mp->m_free[XC_FREE_RTEXTENTS].res_avail, 621 mp->m_free[XC_FREE_RTAVAILABLE].res_avail); 622 mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; 623 mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; 624 spin_unlock(&mp->m_sb_lock); 625 626 if (!*count_fsb) 627 return NULL; 628 629 *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0); 630 *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); 631 if (!*is_seq) 632 *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); 633 oz->oz_allocated += *count_fsb; 634 atomic_inc(&oz->oz_ref); 635 return oz; 636 } 637 638 static void 639 xfs_zone_gc_add_data( 640 struct xfs_gc_bio *chunk) 641 { 642 struct xfs_zone_gc_data *data = chunk->data; 643 unsigned int len = chunk->len; 644 unsigned int off = data->scratch_head; 645 646 do { 647 unsigned int this_off = off % XFS_GC_BUF_SIZE; 648 unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off); 649 650 bio_add_folio_nofail(&chunk->bio, 651 data->scratch_folios[off / XFS_GC_BUF_SIZE], 652 this_len, this_off); 653 len -= this_len; 654 off += this_len; 655 if (off == data->scratch_size) 656 off = 0; 657 } while (len); 658 } 659 660 static bool 661 xfs_zone_gc_start_chunk( 662 struct xfs_zone_gc_data *data) 663 { 664 struct xfs_zone_gc_iter *iter = &data->iter; 665 struct xfs_mount *mp = data->mp; 666 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; 667 struct xfs_open_zone *oz; 668 struct xfs_rmap_irec irec; 669 struct xfs_gc_bio *chunk; 670 struct xfs_inode *ip; 671 struct bio *bio; 672 xfs_daddr_t daddr; 673 unsigned int len; 674 bool is_seq; 675 676 if (xfs_is_shutdown(mp)) 677 return false; 678 679 if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) 680 return false; 681 oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, 682 &is_seq); 683 if (!oz) { 684 xfs_irele(ip); 685 return false; 686 } 687 688 len = XFS_FSB_TO_B(mp, irec.rm_blockcount); 689 bio = bio_alloc_bioset(bdev, 690 min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS), 691 REQ_OP_READ, GFP_NOFS, &data->bio_set); 692 693 chunk = container_of(bio, struct xfs_gc_bio, bio); 694 chunk->ip = ip; 695 chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); 696 chunk->len = len; 697 chunk->old_startblock = 698 xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); 699 chunk->new_daddr = daddr; 700 chunk->is_seq = is_seq; 701 chunk->data = data; 702 chunk->oz = oz; 703 chunk->victim_rtg = iter->victim_rtg; 704 atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); 705 atomic_inc(&chunk->victim_rtg->rtg_gccount); 706 707 bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); 708 bio->bi_end_io = xfs_zone_gc_end_io; 709 xfs_zone_gc_add_data(chunk); 710 data->scratch_head = (data->scratch_head + len) % data->scratch_size; 711 data->scratch_available -= len; 712 713 XFS_STATS_INC(mp, xs_gc_read_calls); 714 715 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 716 list_add_tail(&chunk->entry, &data->reading); 717 xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); 718 719 submit_bio(bio); 720 return true; 721 } 722 723 static void 724 xfs_zone_gc_free_chunk( 725 struct xfs_gc_bio *chunk) 726 { 727 atomic_dec(&chunk->victim_rtg->rtg_gccount); 728 xfs_rtgroup_rele(chunk->victim_rtg); 729 list_del(&chunk->entry); 730 xfs_open_zone_put(chunk->oz); 731 xfs_irele(chunk->ip); 732 bio_put(&chunk->bio); 733 } 734 735 static void 736 xfs_zone_gc_submit_write( 737 struct xfs_zone_gc_data *data, 738 struct xfs_gc_bio *chunk) 739 { 740 if (chunk->is_seq) { 741 chunk->bio.bi_opf &= ~REQ_OP_WRITE; 742 chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; 743 } 744 chunk->bio.bi_iter.bi_sector = chunk->new_daddr; 745 chunk->bio.bi_end_io = xfs_zone_gc_end_io; 746 submit_bio(&chunk->bio); 747 } 748 749 static struct xfs_gc_bio * 750 xfs_zone_gc_split_write( 751 struct xfs_zone_gc_data *data, 752 struct xfs_gc_bio *chunk) 753 { 754 struct queue_limits *lim = 755 &bdev_get_queue(chunk->bio.bi_bdev)->limits; 756 struct xfs_gc_bio *split_chunk; 757 int split_sectors; 758 unsigned int split_len; 759 struct bio *split; 760 unsigned int nsegs; 761 762 if (!chunk->is_seq) 763 return NULL; 764 765 split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, 766 lim->max_zone_append_sectors << SECTOR_SHIFT); 767 if (!split_sectors) 768 return NULL; 769 770 /* ensure the split chunk is still block size aligned */ 771 split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, 772 data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT; 773 split_len = split_sectors << SECTOR_SHIFT; 774 775 split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); 776 split_chunk = container_of(split, struct xfs_gc_bio, bio); 777 split_chunk->data = data; 778 ihold(VFS_I(chunk->ip)); 779 split_chunk->ip = chunk->ip; 780 split_chunk->is_seq = chunk->is_seq; 781 split_chunk->offset = chunk->offset; 782 split_chunk->len = split_len; 783 split_chunk->old_startblock = chunk->old_startblock; 784 split_chunk->new_daddr = chunk->new_daddr; 785 split_chunk->oz = chunk->oz; 786 atomic_inc(&chunk->oz->oz_ref); 787 788 split_chunk->victim_rtg = chunk->victim_rtg; 789 atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); 790 atomic_inc(&chunk->victim_rtg->rtg_gccount); 791 792 chunk->offset += split_len; 793 chunk->len -= split_len; 794 chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); 795 796 /* add right before the original chunk */ 797 WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); 798 list_add_tail(&split_chunk->entry, &chunk->entry); 799 return split_chunk; 800 } 801 802 static void 803 xfs_zone_gc_write_chunk( 804 struct xfs_gc_bio *chunk) 805 { 806 struct xfs_zone_gc_data *data = chunk->data; 807 struct xfs_mount *mp = chunk->ip->i_mount; 808 struct xfs_gc_bio *split_chunk; 809 810 if (chunk->bio.bi_status) 811 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 812 if (xfs_is_shutdown(mp)) { 813 xfs_zone_gc_free_chunk(chunk); 814 return; 815 } 816 817 XFS_STATS_INC(mp, xs_gc_write_calls); 818 XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len); 819 820 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 821 list_move_tail(&chunk->entry, &data->writing); 822 823 bio_reuse(&chunk->bio, REQ_OP_WRITE); 824 while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) 825 xfs_zone_gc_submit_write(data, split_chunk); 826 xfs_zone_gc_submit_write(data, chunk); 827 } 828 829 static void 830 xfs_zone_gc_finish_chunk( 831 struct xfs_gc_bio *chunk) 832 { 833 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 834 struct xfs_zone_gc_data *data = chunk->data; 835 struct xfs_inode *ip = chunk->ip; 836 struct xfs_mount *mp = ip->i_mount; 837 int error; 838 839 if (chunk->bio.bi_status) 840 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 841 if (xfs_is_shutdown(mp)) { 842 xfs_zone_gc_free_chunk(chunk); 843 return; 844 } 845 846 data->scratch_tail = 847 (data->scratch_tail + chunk->len) % data->scratch_size; 848 data->scratch_available += chunk->len; 849 850 /* 851 * Cycle through the iolock and wait for direct I/O and layouts to 852 * ensure no one is reading from the old mapping before it goes away. 853 * 854 * Note that xfs_zoned_end_io() below checks that no other writer raced 855 * with us to update the mapping by checking that the old startblock 856 * didn't change. 857 */ 858 xfs_ilock(ip, iolock); 859 error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); 860 if (!error) 861 inode_dio_wait(VFS_I(ip)); 862 xfs_iunlock(ip, iolock); 863 if (error) 864 goto free; 865 866 if (chunk->is_seq) 867 chunk->new_daddr = chunk->bio.bi_iter.bi_sector; 868 error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, 869 chunk->new_daddr, chunk->oz, chunk->old_startblock); 870 free: 871 if (error) 872 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 873 xfs_zone_gc_free_chunk(chunk); 874 } 875 876 static void 877 xfs_zone_gc_finish_reset( 878 struct xfs_gc_bio *chunk) 879 { 880 struct xfs_rtgroup *rtg = chunk->bio.bi_private; 881 struct xfs_mount *mp = rtg_mount(rtg); 882 struct xfs_zone_info *zi = mp->m_zone_info; 883 884 if (chunk->bio.bi_status) { 885 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 886 goto out; 887 } 888 889 xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE); 890 atomic_inc(&zi->zi_nr_free_zones); 891 892 xfs_zoned_add_available(mp, rtg_blocks(rtg)); 893 894 wake_up_all(&zi->zi_zone_wait); 895 out: 896 list_del(&chunk->entry); 897 bio_put(&chunk->bio); 898 } 899 900 static void 901 xfs_submit_zone_reset_bio( 902 struct xfs_rtgroup *rtg, 903 struct bio *bio) 904 { 905 struct xfs_mount *mp = rtg_mount(rtg); 906 907 trace_xfs_zone_reset(rtg); 908 909 ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); 910 911 if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) { 912 bio_io_error(bio); 913 return; 914 } 915 916 XFS_STATS_INC(mp, xs_gc_zone_reset_calls); 917 918 bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0); 919 if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { 920 /* 921 * Also use the bio to drive the state machine when neither 922 * zone reset nor discard is supported to keep things simple. 923 */ 924 if (!bdev_max_discard_sectors(bio->bi_bdev)) { 925 bio_endio(bio); 926 return; 927 } 928 bio->bi_opf &= ~REQ_OP_ZONE_RESET; 929 bio->bi_opf |= REQ_OP_DISCARD; 930 bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg)); 931 } 932 933 submit_bio(bio); 934 } 935 936 static void xfs_bio_wait_endio(struct bio *bio) 937 { 938 complete(bio->bi_private); 939 } 940 941 int 942 xfs_zone_gc_reset_sync( 943 struct xfs_rtgroup *rtg) 944 { 945 DECLARE_COMPLETION_ONSTACK(done); 946 struct bio bio; 947 int error; 948 949 bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, 950 REQ_OP_ZONE_RESET | REQ_SYNC); 951 bio.bi_private = &done; 952 bio.bi_end_io = xfs_bio_wait_endio; 953 xfs_submit_zone_reset_bio(rtg, &bio); 954 wait_for_completion_io(&done); 955 956 error = blk_status_to_errno(bio.bi_status); 957 bio_uninit(&bio); 958 return error; 959 } 960 961 static void 962 xfs_zone_gc_reset_zones( 963 struct xfs_zone_gc_data *data, 964 struct xfs_group *reset_list) 965 { 966 struct xfs_group *next = reset_list; 967 968 if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { 969 xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); 970 return; 971 } 972 973 do { 974 struct xfs_rtgroup *rtg = to_rtg(next); 975 struct xfs_gc_bio *chunk; 976 struct bio *bio; 977 978 xfs_log_force_inode(rtg_rmap(rtg)); 979 980 next = rtg_group(rtg)->xg_next_reset; 981 rtg_group(rtg)->xg_next_reset = NULL; 982 983 bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, 984 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); 985 bio->bi_private = rtg; 986 bio->bi_end_io = xfs_zone_gc_end_io; 987 988 chunk = container_of(bio, struct xfs_gc_bio, bio); 989 chunk->data = data; 990 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 991 list_add_tail(&chunk->entry, &data->resetting); 992 xfs_submit_zone_reset_bio(rtg, bio); 993 } while (next); 994 } 995 996 static bool 997 xfs_zone_gc_should_start_new_work( 998 struct xfs_zone_gc_data *data) 999 { 1000 struct xfs_open_zone *oz; 1001 1002 if (xfs_is_shutdown(data->mp)) 1003 return false; 1004 if (!data->scratch_available) 1005 return false; 1006 1007 oz = xfs_zone_gc_ensure_target(data->mp); 1008 if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg)) 1009 return false; 1010 1011 if (!data->iter.victim_rtg) { 1012 if (kthread_should_stop() || kthread_should_park()) 1013 return false; 1014 if (!xfs_zoned_need_gc(data->mp)) 1015 return false; 1016 if (!xfs_zone_gc_select_victim(data)) 1017 return false; 1018 } 1019 1020 return true; 1021 } 1022 1023 /* 1024 * Handle the work to read and write data for GC and to reset the zones, 1025 * including handling all completions. 1026 * 1027 * Note that the order of the chunks is preserved so that we don't undo the 1028 * optimal order established by xfs_zone_gc_query(). 1029 */ 1030 static void 1031 xfs_zone_gc_handle_work( 1032 struct xfs_zone_gc_data *data) 1033 { 1034 struct xfs_zone_info *zi = data->mp->m_zone_info; 1035 struct xfs_gc_bio *chunk, *next; 1036 struct xfs_group *reset_list; 1037 struct blk_plug plug; 1038 1039 spin_lock(&zi->zi_reset_list_lock); 1040 reset_list = zi->zi_reset_list; 1041 zi->zi_reset_list = NULL; 1042 spin_unlock(&zi->zi_reset_list_lock); 1043 1044 if (reset_list) { 1045 set_current_state(TASK_RUNNING); 1046 xfs_zone_gc_reset_zones(data, reset_list); 1047 } 1048 1049 list_for_each_entry_safe(chunk, next, &data->resetting, entry) { 1050 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1051 break; 1052 set_current_state(TASK_RUNNING); 1053 xfs_zone_gc_finish_reset(chunk); 1054 } 1055 1056 list_for_each_entry_safe(chunk, next, &data->writing, entry) { 1057 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1058 break; 1059 set_current_state(TASK_RUNNING); 1060 xfs_zone_gc_finish_chunk(chunk); 1061 } 1062 1063 blk_start_plug(&plug); 1064 list_for_each_entry_safe(chunk, next, &data->reading, entry) { 1065 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1066 break; 1067 set_current_state(TASK_RUNNING); 1068 xfs_zone_gc_write_chunk(chunk); 1069 } 1070 blk_finish_plug(&plug); 1071 1072 if (xfs_zone_gc_should_start_new_work(data)) { 1073 set_current_state(TASK_RUNNING); 1074 blk_start_plug(&plug); 1075 while (xfs_zone_gc_start_chunk(data)) 1076 ; 1077 blk_finish_plug(&plug); 1078 } 1079 } 1080 1081 /* 1082 * Note that the current GC algorithm would break reflinks and thus duplicate 1083 * data that was shared by multiple owners before. Because of that reflinks 1084 * are currently not supported on zoned file systems and can't be created or 1085 * mounted. 1086 */ 1087 static int 1088 xfs_zoned_gcd( 1089 void *private) 1090 { 1091 struct xfs_zone_gc_data *data = private; 1092 struct xfs_mount *mp = data->mp; 1093 struct xfs_zone_info *zi = mp->m_zone_info; 1094 unsigned int nofs_flag; 1095 1096 nofs_flag = memalloc_nofs_save(); 1097 set_freezable(); 1098 1099 for (;;) { 1100 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); 1101 xfs_set_zonegc_running(mp); 1102 1103 xfs_zone_gc_handle_work(data); 1104 1105 /* 1106 * Only sleep if nothing set the state to running. Else check for 1107 * work again as someone might have queued up more work and woken 1108 * us in the meantime. 1109 */ 1110 if (get_current_state() == TASK_RUNNING) { 1111 try_to_freeze(); 1112 continue; 1113 } 1114 1115 if (list_empty(&data->reading) && 1116 list_empty(&data->writing) && 1117 list_empty(&data->resetting) && 1118 !zi->zi_reset_list) { 1119 xfs_clear_zonegc_running(mp); 1120 xfs_zoned_resv_wake_all(mp); 1121 1122 if (kthread_should_stop()) { 1123 __set_current_state(TASK_RUNNING); 1124 break; 1125 } 1126 1127 if (kthread_should_park()) { 1128 __set_current_state(TASK_RUNNING); 1129 kthread_parkme(); 1130 continue; 1131 } 1132 } 1133 1134 schedule(); 1135 } 1136 xfs_clear_zonegc_running(mp); 1137 1138 if (data->iter.victim_rtg) 1139 xfs_rtgroup_rele(data->iter.victim_rtg); 1140 1141 memalloc_nofs_restore(nofs_flag); 1142 xfs_zone_gc_data_free(data); 1143 return 0; 1144 } 1145 1146 void 1147 xfs_zone_gc_start( 1148 struct xfs_mount *mp) 1149 { 1150 if (xfs_has_zoned(mp)) 1151 kthread_unpark(mp->m_zone_info->zi_gc_thread); 1152 } 1153 1154 void 1155 xfs_zone_gc_stop( 1156 struct xfs_mount *mp) 1157 { 1158 if (xfs_has_zoned(mp)) 1159 kthread_park(mp->m_zone_info->zi_gc_thread); 1160 } 1161 1162 int 1163 xfs_zone_gc_mount( 1164 struct xfs_mount *mp) 1165 { 1166 struct xfs_zone_info *zi = mp->m_zone_info; 1167 struct xfs_zone_gc_data *data; 1168 struct xfs_open_zone *oz; 1169 int error; 1170 1171 /* 1172 * If there are no free zones available for GC, pick the open zone with 1173 * the least used space to GC into. This should only happen after an 1174 * unclean shutdown near ENOSPC while GC was ongoing. 1175 * 1176 * We also need to do this for the first gc zone allocation if we 1177 * unmounted while at the open limit. 1178 */ 1179 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || 1180 zi->zi_nr_open_zones == mp->m_max_open_zones) 1181 oz = xfs_zone_gc_steal_open(zi); 1182 else 1183 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); 1184 if (!oz) { 1185 xfs_warn(mp, "unable to allocate a zone for gc"); 1186 error = -EIO; 1187 goto out; 1188 } 1189 1190 trace_xfs_zone_gc_target_opened(oz->oz_rtg); 1191 zi->zi_open_gc_zone = oz; 1192 1193 data = xfs_zone_gc_data_alloc(mp); 1194 if (!data) { 1195 error = -ENOMEM; 1196 goto out_put_gc_zone; 1197 } 1198 1199 zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, 1200 "xfs-zone-gc/%s", mp->m_super->s_id); 1201 if (IS_ERR(zi->zi_gc_thread)) { 1202 xfs_warn(mp, "unable to create zone gc thread"); 1203 error = PTR_ERR(zi->zi_gc_thread); 1204 goto out_free_gc_data; 1205 } 1206 1207 /* xfs_zone_gc_start will unpark for rw mounts */ 1208 kthread_park(zi->zi_gc_thread); 1209 return 0; 1210 1211 out_free_gc_data: 1212 kfree(data); 1213 out_put_gc_zone: 1214 xfs_open_zone_put(zi->zi_open_gc_zone); 1215 out: 1216 return error; 1217 } 1218 1219 void 1220 xfs_zone_gc_unmount( 1221 struct xfs_mount *mp) 1222 { 1223 struct xfs_zone_info *zi = mp->m_zone_info; 1224 1225 kthread_stop(zi->zi_gc_thread); 1226 if (zi->zi_open_gc_zone) 1227 xfs_open_zone_put(zi->zi_open_gc_zone); 1228 } 1229