// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file. To support that a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list. Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio. Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};

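/* Maximum number of rmap records gathered from the victim zone per query. */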
#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpads used, and index to indicate which one is in use.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
			(mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
	if (available < mult_frac(free, mp->m_zonegc_low_space, 100))
		return true;

	return false;
}

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec. It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)
{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

#define cmp_int(l, r)		((l > r) - (l < r))

static int
xfs_zone_gc_rmap_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int			diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

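/*
 * Get the next rmap record to garbage collect, and grab the inode owning it.
 *
 * Records whose owning inode has already been deleted, or that do not refer
 * to a regular realtime file, are skipped. Returns false once the victim
 * zone has been fully processed or on error.
 */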
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	if (xfs_is_shutdown(mp))
		return false;

	if (iter->victim_rtg)
		return true;

	/*
	 * Don't start new work if we are asked to stop or park.
	 */
	if (kthread_should_stop() || kthread_should_park())
		return false;

	if (!xfs_zoned_need_gc(mp))
		return false;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

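/*
 * Take the open zone with the least data written to it away from the regular
 * writers and dedicate it to GC. Only used during mount when either no free
 * zone is left for the GC target or all open zone slots are already in use.
 */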
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found ||
		    oz->oz_write_pointer < found->oz_write_pointer)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

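/*
 * I/O completion handler for GC reads, writes and zone resets: mark the chunk
 * as done and let the GC thread do all further processing.
 */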
static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
	oz->oz_write_pointer += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

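/*
 * Submit a GC write. For sequential write required zones the write is
 * converted to a zone append, so the actual write location is only known
 * once the I/O completes (see xfs_zone_gc_finish_chunk()).
 */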
static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

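	/*
	 * The data now sits in the scratch folio. Reuse the read bio to
	 * write it back out to the GC target zone, splitting the I/O as
	 * needed to fit the zone append limits.
	 */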
	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			folio_offset);

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

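	/*
	 * The RT device cache has been flushed above. Forcing the log for
	 * each zone's rmap inode below ensures that all transactions
	 * referencing the rmap have been committed to disk before the zone
	 * is reset or discarded.
	 */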
	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static bool
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (!xfs_zone_gc_select_victim(data) ||
	    !xfs_zone_gc_space_available(data)) {
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !reset_list)
			return false;
	}

	__set_current_state(TASK_RUNNING);
	try_to_freeze();

	if (reset_list)
		xfs_zone_gc_reset_zones(data, reset_list);

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	blk_start_plug(&plug);
	while (xfs_zone_gc_start_chunk(data))
		;
	blk_finish_plug(&plug);
	return true;
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that, reflinks
 * are currently not supported on zoned file systems: such file systems can
 * neither be created nor mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);
		if (xfs_zone_gc_handle_work(data))
			continue;

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into. This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}