// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move the data remaining in a zone out of it in order to reset the
 * zone and prepare it for writing again.
 *
 * This is done by the GC thread implemented in this file. To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio *folio;
	unsigned int offset;
	unsigned int freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data *data;

	/*
	 * Entry into the reading/writing/resetting list. Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head entry;

	/*
	 * State of this gc_bio. Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode *ip;
	loff_t offset;
	unsigned int len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t old_startblock;
	xfs_daddr_t new_daddr;
	struct xfs_zone_scratch *scratch;

	/* Are we writing to a sequential write required zone? */
	bool is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone *oz;

	struct xfs_rtgroup *victim_rtg;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec bv;
	struct bio bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup *victim_rtg;
	unsigned int rec_count;
	unsigned int rec_idx;
	xfs_agblock_t next_startblock;
	struct xfs_rmap_irec *recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount *mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set bio_set;

	/*
	 * Scratchpads used, and index indicating which one is currently used.
	 */
	struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head reading;
	struct list_head writing;
	struct list_head resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount *mp)
{
	s64 available, free, threshold;
	s32 remainder;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

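	/*
	 * The threshold is free * m_zonegc_low_space / 100, computed as a
	 * quotient/remainder pair to avoid overflowing the 64-bit
	 * multiplication for very large free counts.
	 */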
	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}

static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount *mp)
{
	struct xfs_zone_gc_data *data;
	int i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec. It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data *data)
{
	int i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter *iter,
	struct xfs_rtgroup *victim_rtg)

{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
	atomic_inc(&victim_rtg->rtg_gccount);
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur *cur,
	const struct xfs_rmap_irec *irec,
	void *private)
{
	struct xfs_zone_gc_iter *iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
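	/*
	 * Stop the query once the record array is full; next_startblock
	 * remembers where to restart the next query from.
	 */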
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
	const void *a,
	const void *b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount *mp,
	struct xfs_zone_gc_iter *iter)
{
	struct xfs_rtgroup *rtg = iter->victim_rtg;
	struct xfs_rmap_irec ri_low = { };
	struct xfs_rmap_irec ri_high;
	struct xfs_btree_cur *cur;
	struct xfs_trans *tp;
	int error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look-ahead window,
	 * but that's better left until we have better detection of changes to
	 * the inode mappings to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

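	/*
	 * A positive return value means the query callback stopped early
	 * because the record array filled up; zero means the whole zone was
	 * covered.
	 */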
	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	atomic_dec(&iter->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

static bool
xfs_zone_gc_iter_next(
	struct xfs_mount *mp,
	struct xfs_zone_gc_iter *iter,
	struct xfs_rmap_irec *chunk_rec,
	struct xfs_inode **ipp)
{
	struct xfs_rmap_irec *irec;
	int error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter *iter,
	xfs_extlen_t count_fsb)
{
	struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount *mp,
	uint32_t bucket)
{
	struct xfs_zone_info *zi = mp->m_zone_info;
	uint32_t victim_used = U32_MAX;
	struct xfs_rtgroup *victim_rtg = NULL;
	uint32_t bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/*
		 * If the zone is already undergoing GC, don't pick it again.
		 *
		 * This prevents us from picking one of the zones for which we
		 * already submitted GC I/O, but for which the remapping hasn't
		 * concluded yet. This won't cause data corruption, but
		 * increases write amplification and slows down GC, so this is
		 * a bad thing.
		 */
		if (atomic_read(&rtg->rtg_gccount)) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the lowest-used
		 * bucket (bucket 0), so avoid the expensive division for the
		 * zones in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data *data)
{
	struct xfs_zone_gc_iter *iter = &data->iter;
	struct xfs_mount *mp = data->mp;
	struct xfs_zone_info *zi = mp->m_zone_info;
	struct xfs_rtgroup *victim_rtg = NULL;
	unsigned int bucket;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info *zi)
{
	struct xfs_open_zone *oz, *found = NULL;

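	/*
	 * Pick the open zone with the fewest allocated blocks, i.e. the one
	 * with the most space left for garbage collection to write into.
	 */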
	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount *mp)
{
	struct xfs_zone_info *zi = mp->m_zone_info;
	struct xfs_open_zone *oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount *mp)
{
	struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data *data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data *data)
{
	struct xfs_open_zone *oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
	struct bio *bio)
{
	struct xfs_gc_bio *chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data *data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data *data,
	xfs_extlen_t *count_fsb,
	xfs_daddr_t *daddr,
	bool *is_seq)
{
	struct xfs_mount *mp = data->mp;
	struct xfs_open_zone *oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we took them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

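	/*
	 * Conventional zones are written at the current allocation offset.
	 * Sequential write required zones use zone append, so only the zone
	 * start is filled in here and the actual write location is picked up
	 * from the bio at I/O completion time.
	 */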
	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data *data)
{
	struct xfs_zone_gc_iter *iter = &data->iter;
	struct xfs_mount *mp = data->mp;
	struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone *oz;
	struct xfs_rmap_irec irec;
	struct xfs_gc_bio *chunk;
	struct xfs_inode *ip;
	struct bio *bio;
	xfs_daddr_t daddr;
	bool is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;
	chunk->victim_rtg = iter->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio *chunk)
{
	atomic_dec(&chunk->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(chunk->victim_rtg);
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data *data,
	struct xfs_gc_bio *chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data *data,
	struct xfs_gc_bio *chunk)
{
	struct queue_limits *lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio *split_chunk;
	int split_sectors;
	unsigned int split_len;
	struct bio *split;
	unsigned int nsegs;

	if (!chunk->is_seq)
		return NULL;

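	/*
	 * Zone append bios can't be split by the block layer, so split the
	 * chunk to the device's zone append limit here.
	 */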
	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	split_chunk->victim_rtg = chunk->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio *chunk)
{
	struct xfs_zone_gc_data *data = chunk->data;
	struct xfs_mount *mp = chunk->ip->i_mount;
	phys_addr_t bvec_paddr =
		bvec_phys(bio_first_bvec_all(&chunk->bio));
	struct xfs_gc_bio *split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

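	/*
	 * Reuse the bio to write the data just read into the scratch folio
	 * out to the GC target zone.
	 */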
	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			offset_in_folio(chunk->scratch->folio, bvec_paddr));

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio *chunk)
{
	uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode *ip = chunk->ip;
	struct xfs_mount *mp = ip->i_mount;
	int error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

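	/*
	 * For zone append writes the block layer returns the actually written
	 * location in bi_sector.
	 */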
	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio *chunk)
{
	struct xfs_rtgroup *rtg = chunk->bio.bi_private;
	struct xfs_mount *mp = rtg_mount(rtg);
	struct xfs_zone_info *zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
	struct bio *bio,
	struct xfs_rtgroup *rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
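	/*
	 * Conventional zones have no zone reset operation; issue a discard
	 * instead if the device supports it, otherwise skip the hardware
	 * operation entirely.
	 */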
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup *rtg)
{
	int error = 0;
	struct bio bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data *data,
	struct xfs_group *reset_list)
{
	struct xfs_group *next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup *rtg = to_rtg(next);
		struct xfs_gc_bio *chunk;
		struct bio *bio;

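		/*
		 * Make sure all transactions referencing this zone's rmap are
		 * on stable storage before the zone is reset.
		 */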
		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data *data)
{
	if (xfs_is_shutdown(data->mp))
		return false;
	if (!xfs_zone_gc_space_available(data))
		return false;

	if (!data->iter.victim_rtg) {
		if (kthread_should_stop() || kthread_should_park())
			return false;
		if (!xfs_zoned_need_gc(data->mp))
			return false;
		if (!xfs_zone_gc_select_victim(data))
			return false;
	}

	return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data *data)
{
	struct xfs_zone_info *zi = data->mp->m_zone_info;
	struct xfs_gc_bio *chunk, *next;
	struct xfs_group *reset_list;
	struct blk_plug plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that, reflinks
 * are currently not supported on zoned file systems: a file system with both
 * features enabled can neither be created nor mounted.
 */
static int
xfs_zoned_gcd(
	void *private)
{
	struct xfs_zone_gc_data *data = private;
	struct xfs_mount *mp = data->mp;
	struct xfs_zone_info *zi = mp->m_zone_info;
	unsigned int nofs_flag;

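	/*
	 * All allocations from the GC thread are made with GFP_NOFS so that
	 * memory reclaim can't recurse back into the file system from here.
	 */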
	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);

		xfs_zone_gc_handle_work(data);

		/*
		 * Only sleep if nothing set the state to running. Else check
		 * for work again as someone might have queued up more work
		 * and woken us in the meantime.
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;
		}

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount *mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount *mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

int
xfs_zone_gc_mount(
	struct xfs_mount *mp)
{
	struct xfs_zone_info *zi = mp->m_zone_info;
	struct xfs_zone_gc_data *data;
	struct xfs_open_zone *oz;
	int error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into. This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount *mp)
{
	struct xfs_zone_info *zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}