// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs_platform.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file.  To support that a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming.  Once found the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location.  To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch allocation, and the number of buffers.
 */
#define XFS_GC_BUF_SIZE		SZ_1M
#define XFS_GC_NR_BUFS		2
/* one bvec per scratch folio plus one for a straddling chunk */
static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Zone that the data is garbage collected out of */
	struct xfs_rtgroup	*victim_rtg;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio		bio;	/* must be last */
};

/* Number of rmap records gathered from a victim zone per query pass */
#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpad to buffer GC data, organized as a ring buffer over
	 * discontiguous folios.  scratch_head is where the buffer is filled,
	 * scratch_tail tracks the buffer space freed, and scratch_available
	 * counts the space available in the ring buffer between the head and
	 * the tail.
	 */
	struct folio		*scratch_folios[XFS_GC_NR_BUFS];
	unsigned int		scratch_size;
	unsigned int		scratch_available;
	unsigned int		scratch_head;
	unsigned int		scratch_tail;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.  Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
166 */ 167 bool 168 xfs_zoned_need_gc( 169 struct xfs_mount *mp) 170 { 171 s64 available, free, threshold; 172 s32 remainder; 173 174 if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) 175 return false; 176 177 available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); 178 179 if (available < 180 xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) 181 return true; 182 183 free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); 184 185 threshold = div_s64_rem(free, 100, &remainder); 186 threshold = threshold * mp->m_zonegc_low_space + 187 remainder * div_s64(mp->m_zonegc_low_space, 100); 188 189 if (available < threshold) 190 return true; 191 192 return false; 193 } 194 195 static struct xfs_zone_gc_data * 196 xfs_zone_gc_data_alloc( 197 struct xfs_mount *mp) 198 { 199 struct xfs_zone_gc_data *data; 200 int i; 201 202 data = kzalloc(sizeof(*data), GFP_KERNEL); 203 if (!data) 204 return NULL; 205 data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), 206 GFP_KERNEL); 207 if (!data->iter.recs) 208 goto out_free_data; 209 210 if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), 211 BIOSET_NEED_BVECS)) 212 goto out_free_recs; 213 for (i = 0; i < XFS_GC_NR_BUFS; i++) { 214 data->scratch_folios[i] = 215 folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE)); 216 if (!data->scratch_folios[i]) 217 goto out_free_scratch; 218 } 219 data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS; 220 data->scratch_available = data->scratch_size; 221 INIT_LIST_HEAD(&data->reading); 222 INIT_LIST_HEAD(&data->writing); 223 INIT_LIST_HEAD(&data->resetting); 224 data->mp = mp; 225 return data; 226 227 out_free_scratch: 228 while (--i >= 0) 229 folio_put(data->scratch_folios[i]); 230 bioset_exit(&data->bio_set); 231 out_free_recs: 232 kfree(data->iter.recs); 233 out_free_data: 234 kfree(data); 235 return NULL; 236 } 237 238 static void 239 xfs_zone_gc_data_free( 240 struct xfs_zone_gc_data *data) 241 { 242 int i; 243 244 for (i = 0; i < 
XFS_GC_NR_BUFS; i++) 245 folio_put(data->scratch_folios[i]); 246 bioset_exit(&data->bio_set); 247 kfree(data->iter.recs); 248 kfree(data); 249 } 250 251 static void 252 xfs_zone_gc_iter_init( 253 struct xfs_zone_gc_iter *iter, 254 struct xfs_rtgroup *victim_rtg) 255 256 { 257 iter->next_startblock = 0; 258 iter->rec_count = 0; 259 iter->rec_idx = 0; 260 iter->victim_rtg = victim_rtg; 261 atomic_inc(&victim_rtg->rtg_gccount); 262 } 263 264 /* 265 * Query the rmap of the victim zone to gather the records to evacuate. 266 */ 267 static int 268 xfs_zone_gc_query_cb( 269 struct xfs_btree_cur *cur, 270 const struct xfs_rmap_irec *irec, 271 void *private) 272 { 273 struct xfs_zone_gc_iter *iter = private; 274 275 ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); 276 ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); 277 ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); 278 279 iter->recs[iter->rec_count] = *irec; 280 if (++iter->rec_count == XFS_ZONE_GC_RECS) { 281 iter->next_startblock = 282 irec->rm_startblock + irec->rm_blockcount; 283 return 1; 284 } 285 return 0; 286 } 287 288 static int 289 xfs_zone_gc_rmap_rec_cmp( 290 const void *a, 291 const void *b) 292 { 293 const struct xfs_rmap_irec *reca = a; 294 const struct xfs_rmap_irec *recb = b; 295 int diff; 296 297 diff = cmp_int(reca->rm_owner, recb->rm_owner); 298 if (diff) 299 return diff; 300 return cmp_int(reca->rm_offset, recb->rm_offset); 301 } 302 303 static int 304 xfs_zone_gc_query( 305 struct xfs_mount *mp, 306 struct xfs_zone_gc_iter *iter) 307 { 308 struct xfs_rtgroup *rtg = iter->victim_rtg; 309 struct xfs_rmap_irec ri_low = { }; 310 struct xfs_rmap_irec ri_high; 311 struct xfs_btree_cur *cur; 312 struct xfs_trans *tp; 313 int error; 314 315 ASSERT(iter->next_startblock <= rtg_blocks(rtg)); 316 if (iter->next_startblock == rtg_blocks(rtg)) 317 goto done; 318 319 ASSERT(iter->next_startblock < rtg_blocks(rtg)); 320 ri_low.rm_startblock = iter->next_startblock; 321 memset(&ri_high, 
	       0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	/* Done with this victim: drop the GC marker and our reference. */
	atomic_dec(&iter->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

/*
 * Return the next rmap record to evacuate in *chunk_rec and grab a reference
 * to the owning inode in *ipp.  Refills the record buffer from the rmap as
 * needed, and skips records whose inode was deleted or is not a realtime
 * regular file.  Returns false when the victim zone is exhausted; shuts the
 * filesystem down on unexpected errors.
 */
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

/*
 * Consume count_fsb blocks from the current record, moving to the next
 * record once it is fully processed.
 */
static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

/*
 * Scan one used-space bucket for the zone with the least used blocks.
 * Returns the grabbed victim rtgroup or NULL if the bucket has no suitable
 * candidate.  Called with zi_used_buckets_lock held.
 */
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/*
		 * If the zone is already undergoing GC, don't pick it again.
		 *
		 * This prevents us from picking one of the zones for which we
		 * already submitted GC I/O, but for which the remapping hasn't
		 * concluded yet.  This won't cause data corruption, but
		 * increases write amplification and slows down GC, so this is
		 * a bad thing.
		 */
		if (atomic_read(&rtg->rtg_gccount)) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim.  All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	/* Buckets are ordered; lower buckets hold emptier, cheaper victims. */
	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

/*
 * Take the open zone with the least allocated space away from normal data
 * placement and dedicate it to GC.  Used when no free zone can be opened,
 * e.g. after an unclean shutdown near ENOSPC.
 */
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

/*
 * Open a fresh zone as the GC target and publish it as zi_open_gc_zone.
 * Returns NULL while writes to the previous target are still in flight.
 */
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

/*
 * I/O completion for GC reads, writes and resets: mark the chunk done and
 * kick the GC thread, which does all actual completion processing.
 */
static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

/*
 * Allocate up to *count_fsb blocks in the GC target zone, clamped by the
 * scratch space, the remaining space in the zone, and the reserved pools.
 * On success *count_fsb, *daddr and *is_seq are filled in and a referenced
 * open zone is returned.
 */
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, data->scratch_available));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	/*
	 * For conventional zones write at the allocation offset; sequential
	 * write required zones use zone append and get the actual position
	 * from the device at I/O completion.
	 */
	if (!*is_seq)
		*daddr += XFS_FSB_TO_B(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

/*
 * Attach chunk->len bytes of scratch ring buffer space, starting at the
 * current head, to the chunk's bio.  A chunk can straddle the boundary
 * between two scratch folios and the ring buffer wrap-around.
 */
static void
xfs_zone_gc_add_data(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	unsigned int		len = chunk->len;
	unsigned int		off = data->scratch_head;

	do {
		unsigned int this_off = off % XFS_GC_BUF_SIZE;
		unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);

		bio_add_folio_nofail(&chunk->bio,
				data->scratch_folios[off / XFS_GC_BUF_SIZE],
				this_len, this_off);
		len -= this_len;
		off += this_len;
		if (off == data->scratch_size)
			off = 0;
	} while (len);
}

/*
 * Start a new GC chunk: take the next record from the iterator, allocate
 * blocks in the target zone and scratch space, and submit the read of the
 * old data.  Returns false when there is no more work to start right now.
 */
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	unsigned int		len;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	bio = bio_alloc_bioset(bdev,
			min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
			REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = len;
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->data = data;
	chunk->oz = oz;
	chunk->victim_rtg = iter->victim_rtg;
	/* hold the victim group and mark it busy for the chunk's lifetime */
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	xfs_zone_gc_add_data(chunk);
	data->scratch_head = (data->scratch_head + len) % data->scratch_size;
	data->scratch_available -= len;

	XFS_STATS_INC(mp, xs_gc_read_calls);

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

/* Drop all references held by a chunk and free it. */
static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	atomic_dec(&chunk->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(chunk->victim_rtg);
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

/*
 * Submit the GC write for a chunk, using zone append for sequential write
 * required zones so the device assigns the final position.
 */
static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

/*
 * Split off the front of a write chunk that exceeds the device's zone append
 * limit into its own chunk, cloning all references.  Returns NULL when no
 * split is needed.
 */
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	/* conventional zones use regular writes and need no splitting here */
	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	split_chunk->victim_rtg = chunk->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	/* advance the remainder of the original chunk past the split */
	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

/*
 * Read for a chunk completed: reuse the bio for the write to the target
 * zone, splitting it as required by the zone append limit.
 */
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_write_calls);
	XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reuse(&chunk->bio, REQ_OP_WRITE);
	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

/*
 * Write for a chunk completed: release the scratch space and remap the file
 * range to the new location, unless another write raced with GC.
 */
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	/* chunks complete in order, so the tail moves in lock-step */
	data->scratch_tail =
		(data->scratch_tail + chunk->len) % data->scratch_size;
	data->scratch_available += chunk->len;

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	/* zone append I/O reports the actual write position on completion */
	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

/*
 * A zone reset completed: mark the zone free again, make its capacity
 * available, and wake anyone waiting for free zones.
 */
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

/*
 * Submit a zone reset for an empty zone, falling back to discard (or a plain
 * completion) for conventional zones.
 */
static void
xfs_submit_zone_reset_bio(
	struct xfs_rtgroup	*rtg,
	struct bio		*bio)
{
	struct xfs_mount	*mp = rtg_mount(rtg);

	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);

	if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
		bio_io_error(bio);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_zone_reset_calls);

	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (!bdev_max_discard_sectors(bio->bi_bdev)) {
			bio_endio(bio);
			return;
		}
		bio->bi_opf &= ~REQ_OP_ZONE_RESET;
		bio->bi_opf |= REQ_OP_DISCARD;
		bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
	}

	submit_bio(bio);
}

/* Completion handler for the synchronous on-stack reset bio below. */
static void xfs_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}

/*
 * Synchronously reset a single zone, e.g. during mount.
 */
int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio		bio;
	int			error;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET | REQ_SYNC);
	bio.bi_private = &done;
	bio.bi_end_io = xfs_bio_wait_endio;
	xfs_submit_zone_reset_bio(rtg, &bio);
	wait_for_completion_io(&done);

	error = blk_status_to_errno(bio.bi_status);
	bio_uninit(&bio);
	return error;
}

/*
 * Submit resets for all zones on the reset list.  The RT device cache is
 * flushed first, and the rmap log is forced per zone, so that no transaction
 * referencing the old data can be replayed after the reset.
 */
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);
		xfs_submit_zone_reset_bio(rtg, bio);
	} while (next);
}

/*
 * Decide if new GC chunks should be started: requires scratch space, space
 * in the target zone, and either an in-progress victim or a newly selected
 * one when GC is needed.
 */
static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	if (xfs_is_shutdown(data->mp))
		return false;
	if (!data->scratch_available)
		return false;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;

	if (!data->iter.victim_rtg) {
		if (kthread_should_stop() || kthread_should_park())
			return false;
		if (!xfs_zoned_need_gc(data->mp))
			return false;
		if (!xfs_zone_gc_select_victim(data))
			return false;
	}

	return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);

		xfs_zone_gc_handle_work(data);

		/*
		 * Only sleep if nothing set the state to running.  Else check
		 * for work again as someone might have queued up more work and
		 * woken us in the meantime.
1112 */ 1113 if (get_current_state() == TASK_RUNNING) { 1114 try_to_freeze(); 1115 continue; 1116 } 1117 1118 if (list_empty(&data->reading) && 1119 list_empty(&data->writing) && 1120 list_empty(&data->resetting) && 1121 !zi->zi_reset_list) { 1122 xfs_clear_zonegc_running(mp); 1123 xfs_zoned_resv_wake_all(mp); 1124 1125 if (kthread_should_stop()) { 1126 __set_current_state(TASK_RUNNING); 1127 break; 1128 } 1129 1130 if (kthread_should_park()) { 1131 __set_current_state(TASK_RUNNING); 1132 kthread_parkme(); 1133 continue; 1134 } 1135 } 1136 1137 schedule(); 1138 } 1139 xfs_clear_zonegc_running(mp); 1140 1141 if (data->iter.victim_rtg) 1142 xfs_rtgroup_rele(data->iter.victim_rtg); 1143 1144 memalloc_nofs_restore(nofs_flag); 1145 xfs_zone_gc_data_free(data); 1146 return 0; 1147 } 1148 1149 void 1150 xfs_zone_gc_start( 1151 struct xfs_mount *mp) 1152 { 1153 if (xfs_has_zoned(mp)) 1154 kthread_unpark(mp->m_zone_info->zi_gc_thread); 1155 } 1156 1157 void 1158 xfs_zone_gc_stop( 1159 struct xfs_mount *mp) 1160 { 1161 if (xfs_has_zoned(mp)) 1162 kthread_park(mp->m_zone_info->zi_gc_thread); 1163 } 1164 1165 int 1166 xfs_zone_gc_mount( 1167 struct xfs_mount *mp) 1168 { 1169 struct xfs_zone_info *zi = mp->m_zone_info; 1170 struct xfs_zone_gc_data *data; 1171 struct xfs_open_zone *oz; 1172 int error; 1173 1174 /* 1175 * If there are no free zones available for GC, pick the open zone with 1176 * the least used space to GC into. This should only happen after an 1177 * unclean shutdown near ENOSPC while GC was ongoing. 1178 * 1179 * We also need to do this for the first gc zone allocation if we 1180 * unmounted while at the open limit. 
1181 */ 1182 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || 1183 zi->zi_nr_open_zones == mp->m_max_open_zones) 1184 oz = xfs_zone_gc_steal_open(zi); 1185 else 1186 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); 1187 if (!oz) { 1188 xfs_warn(mp, "unable to allocate a zone for gc"); 1189 error = -EIO; 1190 goto out; 1191 } 1192 1193 trace_xfs_zone_gc_target_opened(oz->oz_rtg); 1194 zi->zi_open_gc_zone = oz; 1195 1196 data = xfs_zone_gc_data_alloc(mp); 1197 if (!data) { 1198 error = -ENOMEM; 1199 goto out_put_gc_zone; 1200 } 1201 1202 zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, 1203 "xfs-zone-gc/%s", mp->m_super->s_id); 1204 if (IS_ERR(zi->zi_gc_thread)) { 1205 xfs_warn(mp, "unable to create zone gc thread"); 1206 error = PTR_ERR(zi->zi_gc_thread); 1207 goto out_free_gc_data; 1208 } 1209 1210 /* xfs_zone_gc_start will unpark for rw mounts */ 1211 kthread_park(zi->zi_gc_thread); 1212 return 0; 1213 1214 out_free_gc_data: 1215 kfree(data); 1216 out_put_gc_zone: 1217 xfs_open_zone_put(zi->zi_open_gc_zone); 1218 out: 1219 return error; 1220 } 1221 1222 void 1223 xfs_zone_gc_unmount( 1224 struct xfs_mount *mp) 1225 { 1226 struct xfs_zone_info *zi = mp->m_zone_info; 1227 1228 kthread_stop(zi->zi_gc_thread); 1229 if (zi->zi_open_gc_zone) 1230 xfs_open_zone_put(zi->zi_open_gc_zone); 1231 } 1232