1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2023-2025 Christoph Hellwig. 4 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. 5 */ 6 #include "xfs_platform.h" 7 #include "xfs_shared.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_trans_resv.h" 11 #include "xfs_mount.h" 12 #include "xfs_inode.h" 13 #include "xfs_btree.h" 14 #include "xfs_trans.h" 15 #include "xfs_icache.h" 16 #include "xfs_rmap.h" 17 #include "xfs_rtbitmap.h" 18 #include "xfs_rtrmap_btree.h" 19 #include "xfs_errortag.h" 20 #include "xfs_error.h" 21 #include "xfs_zone_alloc.h" 22 #include "xfs_zone_priv.h" 23 #include "xfs_zones.h" 24 #include "xfs_trace.h" 25 26 /* 27 * Implement Garbage Collection (GC) of partially used zoned. 28 * 29 * To support the purely sequential writes in each zone, zoned XFS needs to be 30 * able to move data remaining in a zone out of it to reset the zone to prepare 31 * for writing to it again. 32 * 33 * This is done by the GC thread implemented in this file. To support that a 34 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to 35 * write the garbage collected data into. 36 * 37 * Whenever the available space is below the chosen threshold, the GC thread 38 * looks for potential non-empty but not fully used zones that are worth 39 * reclaiming. Once found the rmap for the victim zone is queried, and after 40 * a bit of sorting to reduce fragmentation, the still live extents are read 41 * into memory and written to the GC target zone, and the bmap btree of the 42 * files is updated to point to the new location. To avoid taking the IOLOCK 43 * and MMAPLOCK for the entire GC process and thus affecting the latency of 44 * user reads and writes to the files, the GC writes are speculative and the 45 * I/O completion checks that no other writes happened for the affected regions 46 * before remapping. 47 * 48 * Once a zone does not contain any valid data, be that through GC or user 49 * block removal, it is queued for for a zone reset. The reset operation 50 * carefully ensures that the RT device cache is flushed and all transactions 51 * referencing the rmap have been committed to disk. 52 */ 53 54 /* 55 * Size of each GC scratch allocation, and the number of buffers. 56 */ 57 #define XFS_GC_BUF_SIZE SZ_1M 58 #define XFS_GC_NR_BUFS 2 59 static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS); 60 61 /* 62 * Chunk that is read and written for each GC operation. 63 * 64 * Note that for writes to actual zoned devices, the chunk can be split when 65 * reaching the hardware limit. 66 */ 67 struct xfs_gc_bio { 68 struct xfs_zone_gc_data *data; 69 70 /* 71 * Entry into the reading/writing/resetting list. Only accessed from 72 * the GC thread, so no locking needed. 73 */ 74 struct list_head entry; 75 76 /* 77 * State of this gc_bio. Done means the current I/O completed. 78 * Set from the bio end I/O handler, read from the GC thread. 79 */ 80 enum { 81 XFS_GC_BIO_NEW, 82 XFS_GC_BIO_DONE, 83 } state; 84 85 /* 86 * Pointer to the inode and byte range in the inode that this 87 * GC chunk is operating on. 88 */ 89 struct xfs_inode *ip; 90 loff_t offset; 91 unsigned int len; 92 93 /* 94 * Existing startblock (in the zone to be freed) and newly assigned 95 * daddr in the zone GCed into. 96 */ 97 xfs_fsblock_t old_startblock; 98 xfs_daddr_t new_daddr; 99 100 /* Are we writing to a sequential write required zone? */ 101 bool is_seq; 102 103 /* Open Zone being written to */ 104 struct xfs_open_zone *oz; 105 106 struct xfs_rtgroup *victim_rtg; 107 108 /* Bio used for reads and writes, including the bvec used by it */ 109 struct bio bio; /* must be last */ 110 }; 111 112 #define XFS_ZONE_GC_RECS 1024 113 114 /* iterator, needs to be reinitialized for each victim zone */ 115 struct xfs_zone_gc_iter { 116 struct xfs_rtgroup *victim_rtg; 117 unsigned int rec_count; 118 unsigned int rec_idx; 119 xfs_agblock_t next_startblock; 120 struct xfs_rmap_irec *recs; 121 }; 122 123 /* 124 * Per-mount GC state. 125 */ 126 struct xfs_zone_gc_data { 127 struct xfs_mount *mp; 128 struct xfs_open_zone *oz; 129 130 /* bioset used to allocate the gc_bios */ 131 struct bio_set bio_set; 132 133 /* 134 * Scratchpad to buffer GC data, organized as a ring buffer over 135 * discontiguous folios. scratch_head is where the buffer is filled, 136 * scratch_tail tracks the buffer space freed, and scratch_available 137 * counts the space available in the ring buffer between the head and 138 * the tail. 139 */ 140 struct folio *scratch_folios[XFS_GC_NR_BUFS]; 141 unsigned int scratch_size; 142 unsigned int scratch_available; 143 unsigned int scratch_head; 144 unsigned int scratch_tail; 145 146 /* 147 * List of bios currently being read, written and reset. 148 * These lists are only accessed by the GC thread itself, and must only 149 * be processed in order. 150 */ 151 struct list_head reading; 152 struct list_head writing; 153 struct list_head resetting; 154 155 /* 156 * Iterator for the victim zone. 157 */ 158 struct xfs_zone_gc_iter iter; 159 }; 160 161 /* 162 * We aim to keep enough zones free in stock to fully use the open zone limit 163 * for data placement purposes. Additionally, the m_zonegc_low_space tunable 164 * can be set to make sure a fraction of the unused blocks are available for 165 * writing. 166 */ 167 bool 168 xfs_zoned_need_gc( 169 struct xfs_mount *mp) 170 { 171 s64 available, free, threshold; 172 s32 remainder; 173 174 /* If we have no reclaimable blocks, running GC is useless. */ 175 if (!xfs_zoned_have_reclaimable(mp->m_zone_info)) 176 return false; 177 178 /* 179 * In order to avoid file fragmentation as much as possible, we should 180 * make sure that we can open enough zones. So trigger GC if the number 181 * of blocks immediately available for writes is lower than the total 182 * number of blocks from all possible open zones. 183 */ 184 available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); 185 if (available < 186 xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) 187 return true; 188 189 /* 190 * For cases where the user wants to be more aggressive with GC, 191 * the sysfs attribute zonegc_low_space may be set to a non zero value, 192 * to indicate that GC should try to maintain at least zonegc_low_space 193 * percent of the free space to be directly available for writing. Check 194 * this here. 195 */ 196 if (!mp->m_zonegc_low_space) 197 return false; 198 199 free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); 200 threshold = div_s64_rem(free, 100, &remainder); 201 threshold = threshold * mp->m_zonegc_low_space + 202 remainder * div_s64(mp->m_zonegc_low_space, 100); 203 204 return available < threshold; 205 } 206 207 static struct xfs_zone_gc_data * 208 xfs_zone_gc_data_alloc( 209 struct xfs_mount *mp) 210 { 211 struct xfs_zone_gc_data *data; 212 int i; 213 214 data = kzalloc_obj(*data); 215 if (!data) 216 return NULL; 217 data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS); 218 if (!data->iter.recs) 219 goto out_free_data; 220 221 if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), 222 BIOSET_NEED_BVECS)) 223 goto out_free_recs; 224 for (i = 0; i < XFS_GC_NR_BUFS; i++) { 225 data->scratch_folios[i] = 226 folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE)); 227 if (!data->scratch_folios[i]) 228 goto out_free_scratch; 229 } 230 data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS; 231 data->scratch_available = data->scratch_size; 232 INIT_LIST_HEAD(&data->reading); 233 INIT_LIST_HEAD(&data->writing); 234 INIT_LIST_HEAD(&data->resetting); 235 data->mp = mp; 236 return data; 237 238 out_free_scratch: 239 while (--i >= 0) 240 folio_put(data->scratch_folios[i]); 241 bioset_exit(&data->bio_set); 242 out_free_recs: 243 kfree(data->iter.recs); 244 out_free_data: 245 kfree(data); 246 return NULL; 247 } 248 249 static void 250 xfs_zone_gc_data_free( 251 struct xfs_zone_gc_data *data) 252 { 253 int i; 254 255 for (i = 0; i < XFS_GC_NR_BUFS; i++) 256 folio_put(data->scratch_folios[i]); 257 bioset_exit(&data->bio_set); 258 kfree(data->iter.recs); 259 kfree(data); 260 } 261 262 static void 263 xfs_zone_gc_iter_init( 264 struct xfs_zone_gc_iter *iter, 265 struct xfs_rtgroup *victim_rtg) 266 267 { 268 iter->next_startblock = 0; 269 iter->rec_count = 0; 270 iter->rec_idx = 0; 271 iter->victim_rtg = victim_rtg; 272 atomic_inc(&victim_rtg->rtg_gccount); 273 } 274 275 /* 276 * Query the rmap of the victim zone to gather the records to evacuate. 277 */ 278 static int 279 xfs_zone_gc_query_cb( 280 struct xfs_btree_cur *cur, 281 const struct xfs_rmap_irec *irec, 282 void *private) 283 { 284 struct xfs_zone_gc_iter *iter = private; 285 286 ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); 287 ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); 288 ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); 289 290 iter->recs[iter->rec_count] = *irec; 291 if (++iter->rec_count == XFS_ZONE_GC_RECS) { 292 iter->next_startblock = 293 irec->rm_startblock + irec->rm_blockcount; 294 return 1; 295 } 296 return 0; 297 } 298 299 static int 300 xfs_zone_gc_rmap_rec_cmp( 301 const void *a, 302 const void *b) 303 { 304 const struct xfs_rmap_irec *reca = a; 305 const struct xfs_rmap_irec *recb = b; 306 int diff; 307 308 diff = cmp_int(reca->rm_owner, recb->rm_owner); 309 if (diff) 310 return diff; 311 return cmp_int(reca->rm_offset, recb->rm_offset); 312 } 313 314 static int 315 xfs_zone_gc_query( 316 struct xfs_mount *mp, 317 struct xfs_zone_gc_iter *iter) 318 { 319 struct xfs_rtgroup *rtg = iter->victim_rtg; 320 struct xfs_rmap_irec ri_low = { }; 321 struct xfs_rmap_irec ri_high; 322 struct xfs_btree_cur *cur; 323 struct xfs_trans *tp; 324 int error; 325 326 ASSERT(iter->next_startblock <= rtg_blocks(rtg)); 327 if (iter->next_startblock == rtg_blocks(rtg)) 328 goto done; 329 330 ASSERT(iter->next_startblock < rtg_blocks(rtg)); 331 ri_low.rm_startblock = iter->next_startblock; 332 memset(&ri_high, 0xFF, sizeof(ri_high)); 333 334 iter->rec_idx = 0; 335 iter->rec_count = 0; 336 337 tp = xfs_trans_alloc_empty(mp); 338 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); 339 cur = xfs_rtrmapbt_init_cursor(tp, rtg); 340 error = xfs_rmap_query_range(cur, &ri_low, &ri_high, 341 xfs_zone_gc_query_cb, iter); 342 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); 343 xfs_btree_del_cursor(cur, error < 0 ? error : 0); 344 xfs_trans_cancel(tp); 345 346 if (error < 0) 347 return error; 348 349 /* 350 * Sort the rmap records by inode number and increasing offset to 351 * defragment the mappings. 352 * 353 * This could be further enhanced by an even bigger look ahead window, 354 * but that's better left until we have better detection of changes to 355 * inode mapping to avoid the potential of GCing already dead data. 356 */ 357 sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), 358 xfs_zone_gc_rmap_rec_cmp, NULL); 359 360 if (error == 0) { 361 /* 362 * We finished iterating through the zone. 363 */ 364 iter->next_startblock = rtg_blocks(rtg); 365 if (iter->rec_count == 0) 366 goto done; 367 } 368 369 return 0; 370 done: 371 atomic_dec(&iter->victim_rtg->rtg_gccount); 372 xfs_rtgroup_rele(iter->victim_rtg); 373 iter->victim_rtg = NULL; 374 return 0; 375 } 376 377 static bool 378 xfs_zone_gc_iter_irec( 379 struct xfs_mount *mp, 380 struct xfs_zone_gc_iter *iter, 381 struct xfs_rmap_irec *chunk_rec, 382 struct xfs_inode **ipp) 383 { 384 struct xfs_rmap_irec *irec; 385 int error; 386 387 retry: 388 if (iter->rec_idx == iter->rec_count) { 389 error = xfs_zone_gc_query(mp, iter); 390 if (error) 391 goto fail; 392 if (!iter->victim_rtg) 393 return false; 394 } 395 396 irec = &iter->recs[iter->rec_idx]; 397 error = xfs_iget(mp, NULL, irec->rm_owner, 398 XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); 399 if (error) { 400 /* 401 * If the inode was already deleted, skip over it. 402 */ 403 if (error == -ENOENT || error == -EINVAL) { 404 iter->rec_idx++; 405 goto retry; 406 } 407 goto fail; 408 } 409 410 if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { 411 iter->rec_idx++; 412 xfs_irele(*ipp); 413 goto retry; 414 } 415 416 *chunk_rec = *irec; 417 return true; 418 419 fail: 420 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 421 return false; 422 } 423 424 static void 425 xfs_zone_gc_iter_advance( 426 struct xfs_zone_gc_iter *iter, 427 xfs_extlen_t count_fsb) 428 { 429 struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; 430 431 irec->rm_offset += count_fsb; 432 irec->rm_startblock += count_fsb; 433 irec->rm_blockcount -= count_fsb; 434 if (!irec->rm_blockcount) 435 iter->rec_idx++; 436 } 437 438 static struct xfs_rtgroup * 439 xfs_zone_gc_pick_victim_from( 440 struct xfs_mount *mp, 441 uint32_t bucket) 442 { 443 struct xfs_zone_info *zi = mp->m_zone_info; 444 uint32_t victim_used = U32_MAX; 445 struct xfs_rtgroup *victim_rtg = NULL; 446 uint32_t bit; 447 448 if (!zi->zi_used_bucket_entries[bucket]) 449 return NULL; 450 451 for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], 452 mp->m_sb.sb_rgcount) { 453 struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); 454 455 if (!rtg) 456 continue; 457 458 /* 459 * If the zone is already undergoing GC, don't pick it again. 460 * 461 * This prevents us from picking one of the zones for which we 462 * already submitted GC I/O, but for which the remapping hasn't 463 * concluded yet. This won't cause data corruption, but 464 * increases write amplification and slows down GC, so this is 465 * a bad thing. 466 */ 467 if (atomic_read(&rtg->rtg_gccount)) { 468 xfs_rtgroup_rele(rtg); 469 continue; 470 } 471 472 /* skip zones that are just waiting for a reset */ 473 if (rtg_rmap(rtg)->i_used_blocks == 0 || 474 rtg_rmap(rtg)->i_used_blocks >= victim_used) { 475 xfs_rtgroup_rele(rtg); 476 continue; 477 } 478 479 if (victim_rtg) 480 xfs_rtgroup_rele(victim_rtg); 481 victim_rtg = rtg; 482 victim_used = rtg_rmap(rtg)->i_used_blocks; 483 484 /* 485 * Any zone that is less than 1 percent used is fair game for 486 * instant reclaim. All of these zones are in the last 487 * bucket, so avoid the expensive division for the zones 488 * in the other buckets. 489 */ 490 if (bucket == 0 && 491 rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) 492 break; 493 } 494 495 return victim_rtg; 496 } 497 498 /* 499 * Iterate through all zones marked as reclaimable and find a candidate to 500 * reclaim. 501 */ 502 static bool 503 xfs_zone_gc_select_victim( 504 struct xfs_zone_gc_data *data) 505 { 506 struct xfs_zone_gc_iter *iter = &data->iter; 507 struct xfs_mount *mp = data->mp; 508 struct xfs_zone_info *zi = mp->m_zone_info; 509 struct xfs_rtgroup *victim_rtg = NULL; 510 unsigned int bucket; 511 512 spin_lock(&zi->zi_used_buckets_lock); 513 for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { 514 victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); 515 if (victim_rtg) 516 break; 517 } 518 spin_unlock(&zi->zi_used_buckets_lock); 519 520 if (!victim_rtg) 521 return false; 522 523 trace_xfs_zone_gc_select_victim(victim_rtg, bucket); 524 xfs_zone_gc_iter_init(iter, victim_rtg); 525 return true; 526 } 527 528 static int 529 xfs_zone_gc_steal_open_zone( 530 struct xfs_zone_gc_data *data) 531 { 532 struct xfs_zone_info *zi = data->mp->m_zone_info; 533 struct xfs_open_zone *oz, *found = NULL; 534 535 spin_lock(&zi->zi_open_zones_lock); 536 list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { 537 if (!found || oz->oz_allocated < found->oz_allocated) 538 found = oz; 539 } 540 if (!found) { 541 spin_unlock(&zi->zi_open_zones_lock); 542 return -EIO; 543 } 544 545 trace_xfs_zone_gc_target_stolen(found->oz_rtg); 546 found->oz_is_gc = true; 547 zi->zi_nr_open_zones--; 548 zi->zi_nr_open_gc_zones++; 549 spin_unlock(&zi->zi_open_zones_lock); 550 551 atomic_inc(&found->oz_ref); 552 data->oz = found; 553 return 0; 554 } 555 556 /* 557 * Ensure we have a valid open zone to write to. 558 */ 559 static bool 560 xfs_zone_gc_select_target( 561 struct xfs_zone_gc_data *data) 562 { 563 struct xfs_zone_info *zi = data->mp->m_zone_info; 564 565 if (data->oz) { 566 /* 567 * If we have space available, just keep using the existing 568 * zone. 569 */ 570 if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg)) 571 return true; 572 573 /* 574 * Wait for all writes to the current zone to finish before 575 * picking a new one. 576 */ 577 if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg)) 578 return false; 579 580 xfs_open_zone_put(data->oz); 581 } 582 583 /* 584 * Open a new zone when there is none currently in use. 585 */ 586 ASSERT(zi->zi_nr_open_zones <= 587 data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES); 588 data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true); 589 if (!data->oz) 590 return false; 591 trace_xfs_zone_gc_target_opened(data->oz->oz_rtg); 592 atomic_inc(&data->oz->oz_ref); 593 spin_lock(&zi->zi_open_zones_lock); 594 zi->zi_nr_open_gc_zones++; 595 list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones); 596 spin_unlock(&zi->zi_open_zones_lock); 597 return true; 598 } 599 600 static void 601 xfs_zone_gc_end_io( 602 struct bio *bio) 603 { 604 struct xfs_gc_bio *chunk = 605 container_of(bio, struct xfs_gc_bio, bio); 606 struct xfs_zone_gc_data *data = chunk->data; 607 608 WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); 609 wake_up_process(data->mp->m_zone_info->zi_gc_thread); 610 } 611 612 static bool 613 xfs_zone_gc_alloc_blocks( 614 struct xfs_zone_gc_data *data, 615 xfs_extlen_t *count_fsb, 616 xfs_daddr_t *daddr, 617 bool *is_seq) 618 { 619 struct xfs_mount *mp = data->mp; 620 struct xfs_open_zone *oz = data->oz; 621 622 *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available)); 623 624 /* 625 * Directly allocate GC blocks from the reserved pool. 626 * 627 * If we'd take them from the normal pool we could be stealing blocks 628 * from a regular writer, which would then have to wait for GC and 629 * deadlock. 630 */ 631 spin_lock(&mp->m_sb_lock); 632 *count_fsb = min(*count_fsb, 633 rtg_blocks(oz->oz_rtg) - oz->oz_allocated); 634 *count_fsb = min3(*count_fsb, 635 mp->m_free[XC_FREE_RTEXTENTS].res_avail, 636 mp->m_free[XC_FREE_RTAVAILABLE].res_avail); 637 mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; 638 mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; 639 spin_unlock(&mp->m_sb_lock); 640 641 if (!*count_fsb) 642 return false; 643 644 *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0); 645 *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); 646 if (!*is_seq) 647 *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated); 648 oz->oz_allocated += *count_fsb; 649 atomic_inc(&oz->oz_ref); 650 return true; 651 } 652 653 static void 654 xfs_zone_gc_add_data( 655 struct xfs_gc_bio *chunk) 656 { 657 struct xfs_zone_gc_data *data = chunk->data; 658 unsigned int len = chunk->len; 659 unsigned int off = data->scratch_head; 660 661 do { 662 unsigned int this_off = off % XFS_GC_BUF_SIZE; 663 unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off); 664 665 bio_add_folio_nofail(&chunk->bio, 666 data->scratch_folios[off / XFS_GC_BUF_SIZE], 667 this_len, this_off); 668 len -= this_len; 669 off += this_len; 670 if (off == data->scratch_size) 671 off = 0; 672 } while (len); 673 } 674 675 static bool 676 xfs_zone_gc_can_start_chunk( 677 struct xfs_zone_gc_data *data) 678 { 679 680 if (xfs_is_shutdown(data->mp)) 681 return false; 682 if (!data->scratch_available) 683 return false; 684 685 if (!data->iter.victim_rtg) { 686 if (kthread_should_stop() || kthread_should_park()) 687 return false; 688 if (!xfs_zoned_need_gc(data->mp)) 689 return false; 690 if (!xfs_zone_gc_select_victim(data)) 691 return false; 692 } 693 694 return xfs_zone_gc_select_target(data); 695 } 696 697 static bool 698 xfs_zone_gc_start_chunk( 699 struct xfs_zone_gc_data *data) 700 { 701 struct xfs_zone_gc_iter *iter = &data->iter; 702 struct xfs_mount *mp = data->mp; 703 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; 704 struct xfs_rmap_irec irec; 705 struct xfs_gc_bio *chunk; 706 struct xfs_inode *ip; 707 struct bio *bio; 708 xfs_daddr_t daddr; 709 bool is_seq; 710 711 if (!xfs_zone_gc_can_start_chunk(data)) 712 return false; 713 714 set_current_state(TASK_RUNNING); 715 if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip)) 716 return false; 717 718 if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, 719 &is_seq)) { 720 xfs_irele(ip); 721 return false; 722 } 723 724 /* 725 * Scratch allocation can wrap around to the same buffer again, 726 * provision an extra bvec for that case. 727 */ 728 bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS, 729 &data->bio_set); 730 chunk = container_of(bio, struct xfs_gc_bio, bio); 731 chunk->ip = ip; 732 chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); 733 chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); 734 chunk->old_startblock = 735 xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); 736 chunk->new_daddr = daddr; 737 chunk->is_seq = is_seq; 738 chunk->data = data; 739 chunk->oz = data->oz; 740 chunk->victim_rtg = iter->victim_rtg; 741 atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); 742 atomic_inc(&chunk->victim_rtg->rtg_gccount); 743 744 bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); 745 bio->bi_end_io = xfs_zone_gc_end_io; 746 xfs_zone_gc_add_data(chunk); 747 data->scratch_head = 748 (data->scratch_head + chunk->len) % data->scratch_size; 749 data->scratch_available -= chunk->len; 750 751 XFS_STATS_INC(mp, xs_gc_read_calls); 752 753 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 754 list_add_tail(&chunk->entry, &data->reading); 755 xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); 756 757 submit_bio(bio); 758 return true; 759 } 760 761 static void 762 xfs_zone_gc_free_chunk( 763 struct xfs_gc_bio *chunk) 764 { 765 atomic_dec(&chunk->victim_rtg->rtg_gccount); 766 xfs_rtgroup_rele(chunk->victim_rtg); 767 list_del(&chunk->entry); 768 xfs_open_zone_put(chunk->oz); 769 xfs_irele(chunk->ip); 770 bio_put(&chunk->bio); 771 } 772 773 static void 774 xfs_zone_gc_submit_write( 775 struct xfs_zone_gc_data *data, 776 struct xfs_gc_bio *chunk) 777 { 778 if (chunk->is_seq) { 779 chunk->bio.bi_opf &= ~REQ_OP_WRITE; 780 chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; 781 } 782 chunk->bio.bi_iter.bi_sector = chunk->new_daddr; 783 chunk->bio.bi_end_io = xfs_zone_gc_end_io; 784 submit_bio(&chunk->bio); 785 } 786 787 static struct xfs_gc_bio * 788 xfs_zone_gc_split_write( 789 struct xfs_zone_gc_data *data, 790 struct xfs_gc_bio *chunk) 791 { 792 struct queue_limits *lim = 793 &bdev_get_queue(chunk->bio.bi_bdev)->limits; 794 struct xfs_gc_bio *split_chunk; 795 int split_sectors; 796 unsigned int split_len; 797 struct bio *split; 798 unsigned int nsegs; 799 800 if (!chunk->is_seq) 801 return NULL; 802 803 split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, 804 lim->max_zone_append_sectors << SECTOR_SHIFT); 805 if (!split_sectors) 806 return NULL; 807 808 /* ensure the split chunk is still block size aligned */ 809 split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, 810 data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT; 811 split_len = split_sectors << SECTOR_SHIFT; 812 813 split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); 814 split_chunk = container_of(split, struct xfs_gc_bio, bio); 815 split_chunk->data = data; 816 ihold(VFS_I(chunk->ip)); 817 split_chunk->ip = chunk->ip; 818 split_chunk->is_seq = chunk->is_seq; 819 split_chunk->offset = chunk->offset; 820 split_chunk->len = split_len; 821 split_chunk->old_startblock = chunk->old_startblock; 822 split_chunk->new_daddr = chunk->new_daddr; 823 split_chunk->oz = chunk->oz; 824 atomic_inc(&chunk->oz->oz_ref); 825 826 split_chunk->victim_rtg = chunk->victim_rtg; 827 atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref); 828 atomic_inc(&chunk->victim_rtg->rtg_gccount); 829 830 chunk->offset += split_len; 831 chunk->len -= split_len; 832 chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); 833 834 /* add right before the original chunk */ 835 WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); 836 list_add_tail(&split_chunk->entry, &chunk->entry); 837 return split_chunk; 838 } 839 840 static void 841 xfs_zone_gc_write_chunk( 842 struct xfs_gc_bio *chunk) 843 { 844 struct xfs_zone_gc_data *data = chunk->data; 845 struct xfs_mount *mp = chunk->ip->i_mount; 846 struct xfs_gc_bio *split_chunk; 847 848 if (chunk->bio.bi_status) 849 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 850 if (xfs_is_shutdown(mp)) { 851 xfs_zone_gc_free_chunk(chunk); 852 return; 853 } 854 855 XFS_STATS_INC(mp, xs_gc_write_calls); 856 XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len); 857 858 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 859 list_move_tail(&chunk->entry, &data->writing); 860 861 bio_reuse(&chunk->bio, REQ_OP_WRITE); 862 while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) 863 xfs_zone_gc_submit_write(data, split_chunk); 864 xfs_zone_gc_submit_write(data, chunk); 865 } 866 867 static void 868 xfs_zone_gc_finish_chunk( 869 struct xfs_gc_bio *chunk) 870 { 871 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; 872 struct xfs_zone_gc_data *data = chunk->data; 873 struct xfs_inode *ip = chunk->ip; 874 struct xfs_mount *mp = ip->i_mount; 875 int error; 876 877 if (chunk->bio.bi_status) 878 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 879 if (xfs_is_shutdown(mp)) { 880 xfs_zone_gc_free_chunk(chunk); 881 return; 882 } 883 884 data->scratch_tail = 885 (data->scratch_tail + chunk->len) % data->scratch_size; 886 data->scratch_available += chunk->len; 887 888 /* 889 * Cycle through the iolock and wait for direct I/O and layouts to 890 * ensure no one is reading from the old mapping before it goes away. 891 * 892 * Note that xfs_zoned_end_io() below checks that no other writer raced 893 * with us to update the mapping by checking that the old startblock 894 * didn't change. 895 */ 896 xfs_ilock(ip, iolock); 897 error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); 898 if (!error) 899 inode_dio_wait(VFS_I(ip)); 900 xfs_iunlock(ip, iolock); 901 if (error) 902 goto free; 903 904 if (chunk->is_seq) 905 chunk->new_daddr = chunk->bio.bi_iter.bi_sector; 906 error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, 907 chunk->new_daddr, chunk->oz, chunk->old_startblock); 908 free: 909 if (error) 910 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 911 xfs_zone_gc_free_chunk(chunk); 912 } 913 914 static void 915 xfs_zone_gc_finish_reset( 916 struct xfs_gc_bio *chunk) 917 { 918 struct xfs_rtgroup *rtg = chunk->bio.bi_private; 919 struct xfs_mount *mp = rtg_mount(rtg); 920 struct xfs_zone_info *zi = mp->m_zone_info; 921 922 if (chunk->bio.bi_status) { 923 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); 924 goto out; 925 } 926 927 xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE); 928 atomic_inc(&zi->zi_nr_free_zones); 929 930 xfs_zoned_add_available(mp, rtg_blocks(rtg)); 931 932 wake_up_all(&zi->zi_zone_wait); 933 out: 934 list_del(&chunk->entry); 935 bio_put(&chunk->bio); 936 } 937 938 static void 939 xfs_submit_zone_reset_bio( 940 struct bio *bio, 941 void *priv) 942 { 943 struct xfs_rtgroup *rtg = priv; 944 struct xfs_mount *mp = rtg_mount(rtg); 945 946 trace_xfs_zone_reset(rtg); 947 948 ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); 949 950 if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) { 951 bio_io_error(bio); 952 return; 953 } 954 955 XFS_STATS_INC(mp, xs_gc_zone_reset_calls); 956 957 bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0); 958 if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { 959 /* 960 * Also use the bio to drive the state machine when neither 961 * zone reset nor discard is supported to keep things simple. 962 */ 963 if (!bdev_max_discard_sectors(bio->bi_bdev)) { 964 bio_endio(bio); 965 return; 966 } 967 bio->bi_opf &= ~REQ_OP_ZONE_RESET; 968 bio->bi_opf |= REQ_OP_DISCARD; 969 bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg)); 970 } 971 972 submit_bio(bio); 973 } 974 975 int 976 xfs_zone_gc_reset_sync( 977 struct xfs_rtgroup *rtg) 978 { 979 struct bio bio; 980 int error; 981 982 bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, 983 REQ_OP_ZONE_RESET | REQ_SYNC); 984 bio_await(&bio, rtg, xfs_submit_zone_reset_bio); 985 error = blk_status_to_errno(bio.bi_status); 986 bio_uninit(&bio); 987 return error; 988 } 989 990 static void 991 xfs_zone_gc_reset_zones( 992 struct xfs_zone_gc_data *data, 993 struct xfs_group *reset_list) 994 { 995 struct xfs_group *next = reset_list; 996 997 if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { 998 xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); 999 return; 1000 } 1001 1002 do { 1003 struct xfs_rtgroup *rtg = to_rtg(next); 1004 struct xfs_gc_bio *chunk; 1005 struct bio *bio; 1006 1007 xfs_log_force_inode(rtg_rmap(rtg)); 1008 1009 next = rtg_group(rtg)->xg_next_reset; 1010 rtg_group(rtg)->xg_next_reset = NULL; 1011 1012 bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, 1013 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); 1014 bio->bi_private = rtg; 1015 bio->bi_end_io = xfs_zone_gc_end_io; 1016 1017 chunk = container_of(bio, struct xfs_gc_bio, bio); 1018 chunk->data = data; 1019 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); 1020 list_add_tail(&chunk->entry, &data->resetting); 1021 xfs_submit_zone_reset_bio(bio, rtg); 1022 } while (next); 1023 } 1024 1025 /* 1026 * Handle the work to read and write data for GC and to reset the zones, 1027 * including handling all completions. 1028 * 1029 * Note that the order of the chunks is preserved so that we don't undo the 1030 * optimal order established by xfs_zone_gc_query(). 1031 */ 1032 static void 1033 xfs_zone_gc_handle_work( 1034 struct xfs_zone_gc_data *data) 1035 { 1036 struct xfs_zone_info *zi = data->mp->m_zone_info; 1037 struct xfs_gc_bio *chunk, *next; 1038 struct xfs_group *reset_list; 1039 struct blk_plug plug; 1040 1041 spin_lock(&zi->zi_reset_list_lock); 1042 reset_list = zi->zi_reset_list; 1043 zi->zi_reset_list = NULL; 1044 spin_unlock(&zi->zi_reset_list_lock); 1045 1046 if (reset_list) { 1047 set_current_state(TASK_RUNNING); 1048 xfs_zone_gc_reset_zones(data, reset_list); 1049 } 1050 1051 list_for_each_entry_safe(chunk, next, &data->resetting, entry) { 1052 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1053 break; 1054 set_current_state(TASK_RUNNING); 1055 xfs_zone_gc_finish_reset(chunk); 1056 } 1057 1058 list_for_each_entry_safe(chunk, next, &data->writing, entry) { 1059 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1060 break; 1061 set_current_state(TASK_RUNNING); 1062 xfs_zone_gc_finish_chunk(chunk); 1063 } 1064 1065 blk_start_plug(&plug); 1066 list_for_each_entry_safe(chunk, next, &data->reading, entry) { 1067 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) 1068 break; 1069 set_current_state(TASK_RUNNING); 1070 xfs_zone_gc_write_chunk(chunk); 1071 } 1072 blk_finish_plug(&plug); 1073 1074 blk_start_plug(&plug); 1075 while (xfs_zone_gc_start_chunk(data)) 1076 ; 1077 blk_finish_plug(&plug); 1078 } 1079 1080 /* 1081 * Note that the current GC algorithm would break reflinks and thus duplicate 1082 * data that was shared by multiple owners before. Because of that reflinks 1083 * are currently not supported on zoned file systems and can't be created or 1084 * mounted. 1085 */ 1086 static int 1087 xfs_zoned_gcd( 1088 void *private) 1089 { 1090 struct xfs_zone_gc_data *data = private; 1091 struct xfs_mount *mp = data->mp; 1092 struct xfs_zone_info *zi = mp->m_zone_info; 1093 unsigned int nofs_flag; 1094 1095 nofs_flag = memalloc_nofs_save(); 1096 set_freezable(); 1097 1098 for (;;) { 1099 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); 1100 xfs_set_zonegc_running(mp); 1101 1102 xfs_zone_gc_handle_work(data); 1103 1104 /* 1105 * Only sleep if nothing set the state to running. Else check for 1106 * work again as someone might have queued up more work and woken 1107 * us in the meantime. 1108 */ 1109 if (get_current_state() == TASK_RUNNING) { 1110 try_to_freeze(); 1111 continue; 1112 } 1113 1114 if (list_empty(&data->reading) && 1115 list_empty(&data->writing) && 1116 list_empty(&data->resetting) && 1117 !zi->zi_reset_list) { 1118 xfs_clear_zonegc_running(mp); 1119 xfs_zoned_resv_wake_all(mp); 1120 1121 if (kthread_should_stop()) { 1122 __set_current_state(TASK_RUNNING); 1123 break; 1124 } 1125 1126 if (kthread_should_park()) { 1127 __set_current_state(TASK_RUNNING); 1128 kthread_parkme(); 1129 continue; 1130 } 1131 } 1132 1133 schedule(); 1134 } 1135 xfs_clear_zonegc_running(mp); 1136 1137 if (data->oz) 1138 xfs_open_zone_put(data->oz); 1139 if (data->iter.victim_rtg) 1140 xfs_rtgroup_rele(data->iter.victim_rtg); 1141 1142 memalloc_nofs_restore(nofs_flag); 1143 xfs_zone_gc_data_free(data); 1144 return 0; 1145 } 1146 1147 void 1148 xfs_zone_gc_start( 1149 struct xfs_mount *mp) 1150 { 1151 if (xfs_has_zoned(mp)) 1152 kthread_unpark(mp->m_zone_info->zi_gc_thread); 1153 } 1154 1155 void 1156 xfs_zone_gc_stop( 1157 struct xfs_mount *mp) 1158 { 1159 if (xfs_has_zoned(mp)) 1160 kthread_park(mp->m_zone_info->zi_gc_thread); 1161 } 1162 1163 void 1164 xfs_zone_gc_wakeup( 1165 struct xfs_mount *mp) 1166 { 1167 struct super_block *sb = mp->m_super; 1168 1169 /* 1170 * If we are unmounting the file system we must not try to 1171 * wake gc as m_zone_info might have been freed already. 1172 */ 1173 if (down_read_trylock(&sb->s_umount)) { 1174 if (!xfs_is_readonly(mp)) 1175 wake_up_process(mp->m_zone_info->zi_gc_thread); 1176 up_read(&sb->s_umount); 1177 } 1178 } 1179 1180 int 1181 xfs_zone_gc_mount( 1182 struct xfs_mount *mp) 1183 { 1184 struct xfs_zone_info *zi = mp->m_zone_info; 1185 struct xfs_zone_gc_data *data; 1186 int error; 1187 1188 data = xfs_zone_gc_data_alloc(mp); 1189 if (!data) 1190 return -ENOMEM; 1191 1192 /* 1193 * If there are no free zones available for GC, or the number of open 1194 * zones has reached the open zone limit, pick the open zone with 1195 * the least used space to GC into. This should only happen after an 1196 * unclean shutdown while GC was ongoing. Otherwise a GC zone will 1197 * be selected from the free zone pool on demand. 1198 */ 1199 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || 1200 zi->zi_nr_open_zones >= mp->m_max_open_zones) { 1201 error = xfs_zone_gc_steal_open_zone(data); 1202 if (error) { 1203 xfs_warn(mp, "unable to steal an open zone for gc"); 1204 goto out_free_gc_data; 1205 } 1206 } 1207 1208 zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, 1209 "xfs-zone-gc/%s", mp->m_super->s_id); 1210 if (IS_ERR(zi->zi_gc_thread)) { 1211 xfs_warn(mp, "unable to create zone gc thread"); 1212 error = PTR_ERR(zi->zi_gc_thread); 1213 goto out_put_oz; 1214 } 1215 1216 /* xfs_zone_gc_start will unpark for rw mounts */ 1217 kthread_park(zi->zi_gc_thread); 1218 return 0; 1219 1220 out_put_oz: 1221 if (data->oz) 1222 xfs_open_zone_put(data->oz); 1223 out_free_gc_data: 1224 xfs_zone_gc_data_free(data); 1225 return error; 1226 } 1227 1228 void 1229 xfs_zone_gc_unmount( 1230 struct xfs_mount *mp) 1231 { 1232 struct xfs_zone_info *zi = mp->m_zone_info; 1233 1234 kthread_stop(zi->zi_gc_thread); 1235 } 1236