1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2023-2025 Christoph Hellwig.
4 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5 */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_zone_alloc.h"
20 #include "xfs_zone_priv.h"
21 #include "xfs_zones.h"
22 #include "xfs_trace.h"
23
24 /*
25 * Implement Garbage Collection (GC) of partially used zones.
26 *
27 * To support the purely sequential writes in each zone, zoned XFS needs to be
28 * able to move data remaining in a zone out of it to reset the zone to prepare
29 * for writing to it again.
30 *
31 * This is done by the GC thread implemented in this file. To support that, a
32 * number of zones (XFS_GC_ZONES) is reserved from the user-visible capacity to
33 * write the garbage collected data into.
34 *
35 * Whenever the available space is below the chosen threshold, the GC thread
36 * looks for potential non-empty but not fully used zones that are worth
37 * reclaiming. Once found, the rmap for the victim zone is queried, and after
38 * a bit of sorting to reduce fragmentation, the still live extents are read
39 * into memory and written to the GC target zone, and the bmap btree of the
40 * files is updated to point to the new location. To avoid taking the IOLOCK
41 * and MMAPLOCK for the entire GC process and thus affecting the latency of
42 * user reads and writes to the files, the GC writes are speculative and the
43 * I/O completion checks that no other writes happened for the affected regions
44 * before remapping.
45 *
46 * Once a zone does not contain any valid data, be that through GC or user
47 * block removal, it is queued for a zone reset. The reset operation
48 * carefully ensures that the RT device cache is flushed and all transactions
49 * referencing the rmap have been committed to disk.
50 */
51
52 /*
53 * Size of each GC scratch pad. This is also the upper bound for each
54 * GC I/O, which helps to keep latency down.
55 */
56 #define XFS_GC_CHUNK_SIZE SZ_1M
57
58 /*
59 * Scratchpad data to read GCed data into.
60 *
61 * The offset member tracks where the next allocation starts, and freed tracks
62 * the amount of space that is not used anymore.
63 */
64 #define XFS_ZONE_GC_NR_SCRATCH 2
65 struct xfs_zone_scratch {
66 struct folio *folio;
67 unsigned int offset;
68 unsigned int freed;
69 };
70
71 /*
72 * Chunk that is read and written for each GC operation.
73 *
74 * Note that for writes to actual zoned devices, the chunk can be split when
75 * reaching the hardware limit.
76 */
77 struct xfs_gc_bio {
78 struct xfs_zone_gc_data *data;
79
80 /*
81 * Entry into the reading/writing/resetting list. Only accessed from
82 * the GC thread, so no locking needed.
83 */
84 struct list_head entry;
85
86 /*
87 * State of this gc_bio. Done means the current I/O completed.
88 * Set from the bio end I/O handler, read from the GC thread.
89 */
90 enum {
91 XFS_GC_BIO_NEW,
92 XFS_GC_BIO_DONE,
93 } state;
94
95 /*
96 * Pointer to the inode and byte range in the inode that this
97 * GC chunk is operating on.
98 */
99 struct xfs_inode *ip;
100 loff_t offset;
101 unsigned int len;
102
103 /*
104 * Existing startblock (in the zone to be freed) and newly assigned
105 * daddr in the zone GCed into.
106 */
107 xfs_fsblock_t old_startblock;
108 xfs_daddr_t new_daddr;
109 struct xfs_zone_scratch *scratch;
110
111 /* Are we writing to a sequential write required zone? */
112 bool is_seq;
113
114 /* Open Zone being written to */
115 struct xfs_open_zone *oz;
116
117 struct xfs_rtgroup *victim_rtg;
118
119 /* Bio used for reads and writes, including the bvec used by it */
120 struct bio bio; /* must be last */
121 };
122
123 #define XFS_ZONE_GC_RECS 1024
124
125 /* iterator, needs to be reinitialized for each victim zone */
126 struct xfs_zone_gc_iter {
127 struct xfs_rtgroup *victim_rtg;
128 unsigned int rec_count;
129 unsigned int rec_idx;
130 xfs_agblock_t next_startblock;
131 struct xfs_rmap_irec *recs;
132 };
133
134 /*
135 * Per-mount GC state.
136 */
137 struct xfs_zone_gc_data {
138 struct xfs_mount *mp;
139
140 /* bioset used to allocate the gc_bios */
141 struct bio_set bio_set;
142
143 /*
144 * Scratchpads used for GC data, and an index indicating which one is in use.
145 */
146 struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH];
147 unsigned int scratch_idx;
148
149 /*
150 * List of bios currently being read, written and reset.
151 * These lists are only accessed by the GC thread itself, and must only
152 * be processed in order.
153 */
154 struct list_head reading;
155 struct list_head writing;
156 struct list_head resetting;
157
158 /*
159 * Iterator for the victim zone.
160 */
161 struct xfs_zone_gc_iter iter;
162 };
163
164 /*
165 * We aim to keep enough zones free in stock to fully use the open zone limit
166 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
167 * can be set to make sure a fraction of the unused blocks are available for
168 * writing.
169 */
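/*
 * Example with illustrative numbers: if m_zonegc_low_space is set to 25
 * and 1,000,000 RT blocks are free, the threshold computed below works
 * out to 250,000 blocks, so GC kicks in once fewer than that remain
 * available for writing.
 */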
170 bool
171 xfs_zoned_need_gc(
172 struct xfs_mount *mp)
173 {
174 s64 available, free, threshold;
175 s32 remainder;
176
177 if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
178 return false;
179
180 available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
181
182 if (available <
183 xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
184 return true;
185
186 free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
187
188 threshold = div_s64_rem(free, 100, &remainder);
189 threshold = threshold * mp->m_zonegc_low_space +
190 remainder * div_s64(mp->m_zonegc_low_space, 100);
191
192 if (available < threshold)
193 return true;
194
195 return false;
196 }
197
198 static struct xfs_zone_gc_data *
199 xfs_zone_gc_data_alloc(
200 struct xfs_mount *mp)
201 {
202 struct xfs_zone_gc_data *data;
203 int i;
204
205 data = kzalloc(sizeof(*data), GFP_KERNEL);
206 if (!data)
207 return NULL;
208 data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
209 GFP_KERNEL);
210 if (!data->iter.recs)
211 goto out_free_data;
212
213 /*
214 * We actually only need a single bio_vec. It would be nice to have
215 * a flag that only allocates the inline bvecs and not the separate
216 * bvec pool.
217 */
218 if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
219 BIOSET_NEED_BVECS))
220 goto out_free_recs;
221 for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
222 data->scratch[i].folio =
223 folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
224 if (!data->scratch[i].folio)
225 goto out_free_scratch;
226 }
227 INIT_LIST_HEAD(&data->reading);
228 INIT_LIST_HEAD(&data->writing);
229 INIT_LIST_HEAD(&data->resetting);
230 data->mp = mp;
231 return data;
232
233 out_free_scratch:
234 while (--i >= 0)
235 folio_put(data->scratch[i].folio);
236 bioset_exit(&data->bio_set);
237 out_free_recs:
238 kfree(data->iter.recs);
239 out_free_data:
240 kfree(data);
241 return NULL;
242 }
243
244 static void
245 xfs_zone_gc_data_free(
246 struct xfs_zone_gc_data *data)
247 {
248 int i;
249
250 for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
251 folio_put(data->scratch[i].folio);
252 bioset_exit(&data->bio_set);
253 kfree(data->iter.recs);
254 kfree(data);
255 }
256
257 static void
258 xfs_zone_gc_iter_init(
259 struct xfs_zone_gc_iter *iter,
260 struct xfs_rtgroup *victim_rtg)
261
262 {
263 iter->next_startblock = 0;
264 iter->rec_count = 0;
265 iter->rec_idx = 0;
266 iter->victim_rtg = victim_rtg;
267 atomic_inc(&victim_rtg->rtg_gccount);
268 }
269
270 /*
271 * Query the rmap of the victim zone to gather the records to evacuate.
272 */
273 static int
274 xfs_zone_gc_query_cb(
275 struct xfs_btree_cur *cur,
276 const struct xfs_rmap_irec *irec,
277 void *private)
278 {
279 struct xfs_zone_gc_iter *iter = private;
280
281 ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
282 ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
283 ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
284
285 iter->recs[iter->rec_count] = *irec;
286 if (++iter->rec_count == XFS_ZONE_GC_RECS) {
287 iter->next_startblock =
288 irec->rm_startblock + irec->rm_blockcount;
289 return 1;
290 }
291 return 0;
292 }
293
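/*
 * Order rmap records by owning inode first and by file offset second, so
 * that extents belonging to the same file are written out next to each
 * other.
 */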
294 static int
295 xfs_zone_gc_rmap_rec_cmp(
296 const void *a,
297 const void *b)
298 {
299 const struct xfs_rmap_irec *reca = a;
300 const struct xfs_rmap_irec *recb = b;
301 int diff;
302
303 diff = cmp_int(reca->rm_owner, recb->rm_owner);
304 if (diff)
305 return diff;
306 return cmp_int(reca->rm_offset, recb->rm_offset);
307 }
308
309 static int
310 xfs_zone_gc_query(
311 struct xfs_mount *mp,
312 struct xfs_zone_gc_iter *iter)
313 {
314 struct xfs_rtgroup *rtg = iter->victim_rtg;
315 struct xfs_rmap_irec ri_low = { };
316 struct xfs_rmap_irec ri_high;
317 struct xfs_btree_cur *cur;
318 struct xfs_trans *tp;
319 int error;
320
321 ASSERT(iter->next_startblock <= rtg_blocks(rtg));
322 if (iter->next_startblock == rtg_blocks(rtg))
323 goto done;
324
325 ASSERT(iter->next_startblock < rtg_blocks(rtg));
326 ri_low.rm_startblock = iter->next_startblock;
327 memset(&ri_high, 0xFF, sizeof(ri_high));
328
329 iter->rec_idx = 0;
330 iter->rec_count = 0;
331
332 tp = xfs_trans_alloc_empty(mp);
333 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
334 cur = xfs_rtrmapbt_init_cursor(tp, rtg);
335 error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
336 xfs_zone_gc_query_cb, iter);
337 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
338 xfs_btree_del_cursor(cur, error < 0 ? error : 0);
339 xfs_trans_cancel(tp);
340
341 if (error < 0)
342 return error;
343
344 /*
345 * Sort the rmap records by inode number and increasing offset to
346 * defragment the mappings.
347 *
348 * This could be further enhanced by an even bigger lookahead window,
349 * but that's better left until we have better detection of changes to
350 * inode mapping to avoid the potential of GCing already dead data.
351 */
352 sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
353 xfs_zone_gc_rmap_rec_cmp, NULL);
354
355 if (error == 0) {
356 /*
357 * We finished iterating through the zone.
358 */
359 iter->next_startblock = rtg_blocks(rtg);
360 if (iter->rec_count == 0)
361 goto done;
362 }
363
364 return 0;
365 done:
366 atomic_dec(&iter->victim_rtg->rtg_gccount);
367 xfs_rtgroup_rele(iter->victim_rtg);
368 iter->victim_rtg = NULL;
369 return 0;
370 }
371
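/*
 * Return the next rmap record to evacuate and grab a reference to the
 * regular file inode owning it.  Returns false once the victim zone has
 * been fully processed or the file system was shut down on error.
 */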
372 static bool
373 xfs_zone_gc_iter_next(
374 struct xfs_mount *mp,
375 struct xfs_zone_gc_iter *iter,
376 struct xfs_rmap_irec *chunk_rec,
377 struct xfs_inode **ipp)
378 {
379 struct xfs_rmap_irec *irec;
380 int error;
381
382 if (!iter->victim_rtg)
383 return false;
384
385 retry:
386 if (iter->rec_idx == iter->rec_count) {
387 error = xfs_zone_gc_query(mp, iter);
388 if (error)
389 goto fail;
390 if (!iter->victim_rtg)
391 return false;
392 }
393
394 irec = &iter->recs[iter->rec_idx];
395 error = xfs_iget(mp, NULL, irec->rm_owner,
396 XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
397 if (error) {
398 /*
399 * If the inode was already deleted, skip over it.
400 */
401 if (error == -ENOENT) {
402 iter->rec_idx++;
403 goto retry;
404 }
405 goto fail;
406 }
407
408 if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
409 iter->rec_idx++;
410 xfs_irele(*ipp);
411 goto retry;
412 }
413
414 *chunk_rec = *irec;
415 return true;
416
417 fail:
418 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
419 return false;
420 }
421
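/*
 * Consume count_fsb blocks from the current rmap record and move to the
 * next record once it is fully used up.
 */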
422 static void
423 xfs_zone_gc_iter_advance(
424 struct xfs_zone_gc_iter *iter,
425 xfs_extlen_t count_fsb)
426 {
427 struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
428
429 irec->rm_offset += count_fsb;
430 irec->rm_startblock += count_fsb;
431 irec->rm_blockcount -= count_fsb;
432 if (!irec->rm_blockcount)
433 iter->rec_idx++;
434 }
435
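/*
 * Scan one used-space bucket for the zone with the fewest used blocks,
 * skipping zones that are empty, waiting for a reset or already being
 * garbage collected.
 */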
436 static struct xfs_rtgroup *
437 xfs_zone_gc_pick_victim_from(
438 struct xfs_mount *mp,
439 uint32_t bucket)
440 {
441 struct xfs_zone_info *zi = mp->m_zone_info;
442 uint32_t victim_used = U32_MAX;
443 struct xfs_rtgroup *victim_rtg = NULL;
444 uint32_t bit;
445
446 if (!zi->zi_used_bucket_entries[bucket])
447 return NULL;
448
449 for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
450 mp->m_sb.sb_rgcount) {
451 struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
452
453 if (!rtg)
454 continue;
455
456 /*
457 * If the zone is already undergoing GC, don't pick it again.
458 *
459 * This prevents us from picking one of the zones for which we
460 * already submitted GC I/O, but for which the remapping hasn't
461 * concluded yet. This won't cause data corruption, but
462 * increases write amplification and slows down GC, so this is
463 * a bad thing.
464 */
465 if (atomic_read(&rtg->rtg_gccount)) {
466 xfs_rtgroup_rele(rtg);
467 continue;
468 }
469
470 /* skip zones that are just waiting for a reset */
471 if (rtg_rmap(rtg)->i_used_blocks == 0 ||
472 rtg_rmap(rtg)->i_used_blocks >= victim_used) {
473 xfs_rtgroup_rele(rtg);
474 continue;
475 }
476
477 if (victim_rtg)
478 xfs_rtgroup_rele(victim_rtg);
479 victim_rtg = rtg;
480 victim_used = rtg_rmap(rtg)->i_used_blocks;
481
482 /*
483 * Any zone that is less than 1 percent used is fair game for
484 * instant reclaim. All of these zones are in the lowest
485 * bucket, so avoid the expensive division for the zones
486 * in the other buckets.
487 */
488 if (bucket == 0 &&
489 rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
490 break;
491 }
492
493 return victim_rtg;
494 }
495
496 /*
497 * Iterate through all zones marked as reclaimable and find a candidate to
498 * reclaim.
499 */
500 static bool
501 xfs_zone_gc_select_victim(
502 struct xfs_zone_gc_data *data)
503 {
504 struct xfs_zone_gc_iter *iter = &data->iter;
505 struct xfs_mount *mp = data->mp;
506 struct xfs_zone_info *zi = mp->m_zone_info;
507 struct xfs_rtgroup *victim_rtg = NULL;
508 unsigned int bucket;
509
510 spin_lock(&zi->zi_used_buckets_lock);
511 for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
512 victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
513 if (victim_rtg)
514 break;
515 }
516 spin_unlock(&zi->zi_used_buckets_lock);
517
518 if (!victim_rtg)
519 return false;
520
521 trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
522 xfs_zone_gc_iter_init(iter, victim_rtg);
523 return true;
524 }
525
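/*
 * Take the open zone with the least allocated space away from the regular
 * writers and dedicate it to GC.  Only used at mount time when no free
 * zone is available or the open zone limit has already been reached.
 */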
526 static struct xfs_open_zone *
527 xfs_zone_gc_steal_open(
528 struct xfs_zone_info *zi)
529 {
530 struct xfs_open_zone *oz, *found = NULL;
531
532 spin_lock(&zi->zi_open_zones_lock);
533 list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
534 if (!found || oz->oz_allocated < found->oz_allocated)
535 found = oz;
536 }
537
538 if (found) {
539 found->oz_is_gc = true;
540 list_del_init(&found->oz_entry);
541 zi->zi_nr_open_zones--;
542 }
543
544 spin_unlock(&zi->zi_open_zones_lock);
545 return found;
546 }
547
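/*
 * Open a new zone to garbage collect into.  Returns NULL while writes to
 * the previous GC zone are still in flight or if no new zone could be
 * opened.
 */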
548 static struct xfs_open_zone *
549 xfs_zone_gc_select_target(
550 struct xfs_mount *mp)
551 {
552 struct xfs_zone_info *zi = mp->m_zone_info;
553 struct xfs_open_zone *oz = zi->zi_open_gc_zone;
554
555 /*
556 * We need to wait for pending writes to finish.
557 */
558 if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
559 return NULL;
560
561 ASSERT(zi->zi_nr_open_zones <=
562 mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
563 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
564 if (oz)
565 trace_xfs_zone_gc_target_opened(oz->oz_rtg);
566 spin_lock(&zi->zi_open_zones_lock);
567 zi->zi_open_gc_zone = oz;
568 spin_unlock(&zi->zi_open_zones_lock);
569 return oz;
570 }
571
572 /*
573 * Ensure we have a valid open zone to write the GC data to.
574 *
575 * If the current target zone has space, keep writing to it; otherwise first
576 * wait for all pending writes to finish and then pick a new one.
577 */
578 static struct xfs_open_zone *
579 xfs_zone_gc_ensure_target(
580 struct xfs_mount *mp)
581 {
582 struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
583
584 if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
585 return xfs_zone_gc_select_target(mp);
586 return oz;
587 }
588
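/* Bytes still unused in the currently active scratch folio. */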
589 static unsigned int
590 xfs_zone_gc_scratch_available(
591 struct xfs_zone_gc_data *data)
592 {
593 return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
594 }
595
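/* Do we have both a writable GC target zone and scratch space available? */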
596 static bool
597 xfs_zone_gc_space_available(
598 struct xfs_zone_gc_data *data)
599 {
600 struct xfs_open_zone *oz;
601
602 oz = xfs_zone_gc_ensure_target(data->mp);
603 if (!oz)
604 return false;
605 return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
606 xfs_zone_gc_scratch_available(data);
607 }
608
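/*
 * I/O completion handler shared by the read, write and reset bios: mark
 * the chunk as done and wake the GC thread, which does all further
 * processing.
 */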
609 static void
610 xfs_zone_gc_end_io(
611 struct bio *bio)
612 {
613 struct xfs_gc_bio *chunk =
614 container_of(bio, struct xfs_gc_bio, bio);
615 struct xfs_zone_gc_data *data = chunk->data;
616
617 WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
618 wake_up_process(data->mp->m_zone_info->zi_gc_thread);
619 }
620
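/*
 * Allocate space for a GC write in the target zone, trimming the requested
 * block count to the available scratch space, the space left in the zone
 * and the reserved block pools.  Returns the target open zone with an
 * extra reference held, or NULL if nothing could be allocated.
 */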
621 static struct xfs_open_zone *
622 xfs_zone_gc_alloc_blocks(
623 struct xfs_zone_gc_data *data,
624 xfs_extlen_t *count_fsb,
625 xfs_daddr_t *daddr,
626 bool *is_seq)
627 {
628 struct xfs_mount *mp = data->mp;
629 struct xfs_open_zone *oz;
630
631 oz = xfs_zone_gc_ensure_target(mp);
632 if (!oz)
633 return NULL;
634
635 *count_fsb = min(*count_fsb,
636 XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
637
638 /*
639 * Directly allocate GC blocks from the reserved pool.
640 *
641 * If we'd take them from the normal pool we could be stealing blocks
642 * from a regular writer, which would then have to wait for GC and
643 * deadlock.
644 */
645 spin_lock(&mp->m_sb_lock);
646 *count_fsb = min(*count_fsb,
647 rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
648 *count_fsb = min3(*count_fsb,
649 mp->m_free[XC_FREE_RTEXTENTS].res_avail,
650 mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
651 mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
652 mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
653 spin_unlock(&mp->m_sb_lock);
654
655 if (!*count_fsb)
656 return NULL;
657
658 *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
659 *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
660 if (!*is_seq)
661 *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
662 oz->oz_allocated += *count_fsb;
663 atomic_inc(&oz->oz_ref);
664 return oz;
665 }
666
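/*
 * Start reading the next chunk of live data from the victim zone into the
 * scratch folio.  Returns false when no further work can be started.
 */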
667 static bool
668 xfs_zone_gc_start_chunk(
669 struct xfs_zone_gc_data *data)
670 {
671 struct xfs_zone_gc_iter *iter = &data->iter;
672 struct xfs_mount *mp = data->mp;
673 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
674 struct xfs_open_zone *oz;
675 struct xfs_rmap_irec irec;
676 struct xfs_gc_bio *chunk;
677 struct xfs_inode *ip;
678 struct bio *bio;
679 xfs_daddr_t daddr;
680 bool is_seq;
681
682 if (xfs_is_shutdown(mp))
683 return false;
684
685 if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
686 return false;
687 oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
688 &is_seq);
689 if (!oz) {
690 xfs_irele(ip);
691 return false;
692 }
693
694 bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
695
696 chunk = container_of(bio, struct xfs_gc_bio, bio);
697 chunk->ip = ip;
698 chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
699 chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
700 chunk->old_startblock =
701 xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
702 chunk->new_daddr = daddr;
703 chunk->is_seq = is_seq;
704 chunk->scratch = &data->scratch[data->scratch_idx];
705 chunk->data = data;
706 chunk->oz = oz;
707 chunk->victim_rtg = iter->victim_rtg;
708 atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
709 atomic_inc(&chunk->victim_rtg->rtg_gccount);
710
711 bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
712 bio->bi_end_io = xfs_zone_gc_end_io;
713 bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
714 chunk->scratch->offset);
715 chunk->scratch->offset += chunk->len;
716 if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
717 data->scratch_idx =
718 (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
719 }
720 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
721 list_add_tail(&chunk->entry, &data->reading);
722 xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
723
724 submit_bio(bio);
725 return true;
726 }
727
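/* Drop all references held by a GC chunk and free the bio backing it. */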
728 static void
729 xfs_zone_gc_free_chunk(
730 struct xfs_gc_bio *chunk)
731 {
732 atomic_dec(&chunk->victim_rtg->rtg_gccount);
733 xfs_rtgroup_rele(chunk->victim_rtg);
734 list_del(&chunk->entry);
735 xfs_open_zone_put(chunk->oz);
736 xfs_irele(chunk->ip);
737 bio_put(&chunk->bio);
738 }
739
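/*
 * Submit the write of garbage collected data.  For sequential write
 * required zones the write is converted into a zone append so that the
 * device picks the exact write location inside the zone.
 */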
740 static void
741 xfs_zone_gc_submit_write(
742 struct xfs_zone_gc_data *data,
743 struct xfs_gc_bio *chunk)
744 {
745 if (chunk->is_seq) {
746 chunk->bio.bi_opf &= ~REQ_OP_WRITE;
747 chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
748 }
749 chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
750 chunk->bio.bi_end_io = xfs_zone_gc_end_io;
751 submit_bio(&chunk->bio);
752 }
753
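/*
 * Split off the front of a zone append write that exceeds the device's
 * zone append limit, keeping the split boundary block size aligned.
 * Returns the newly created chunk, or NULL if no split is needed.
 */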
754 static struct xfs_gc_bio *
755 xfs_zone_gc_split_write(
756 struct xfs_zone_gc_data *data,
757 struct xfs_gc_bio *chunk)
758 {
759 struct queue_limits *lim =
760 &bdev_get_queue(chunk->bio.bi_bdev)->limits;
761 struct xfs_gc_bio *split_chunk;
762 int split_sectors;
763 unsigned int split_len;
764 struct bio *split;
765 unsigned int nsegs;
766
767 if (!chunk->is_seq)
768 return NULL;
769
770 split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
771 lim->max_zone_append_sectors << SECTOR_SHIFT);
772 if (!split_sectors)
773 return NULL;
774
775 /* ensure the split chunk is still block size aligned */
776 split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
777 data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
778 split_len = split_sectors << SECTOR_SHIFT;
779
780 split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
781 split_chunk = container_of(split, struct xfs_gc_bio, bio);
782 split_chunk->data = data;
783 ihold(VFS_I(chunk->ip));
784 split_chunk->ip = chunk->ip;
785 split_chunk->is_seq = chunk->is_seq;
786 split_chunk->scratch = chunk->scratch;
787 split_chunk->offset = chunk->offset;
788 split_chunk->len = split_len;
789 split_chunk->old_startblock = chunk->old_startblock;
790 split_chunk->new_daddr = chunk->new_daddr;
791 split_chunk->oz = chunk->oz;
792 atomic_inc(&chunk->oz->oz_ref);
793
794 split_chunk->victim_rtg = chunk->victim_rtg;
795 atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
796 atomic_inc(&chunk->victim_rtg->rtg_gccount);
797
798 chunk->offset += split_len;
799 chunk->len -= split_len;
800 chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
801
802 /* add right before the original chunk */
803 WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
804 list_add_tail(&split_chunk->entry, &chunk->entry);
805 return split_chunk;
806 }
807
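/*
 * A read has completed.  Reuse the bio to write the data from the scratch
 * folio to its new location, splitting the write up if needed.
 */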
808 static void
809 xfs_zone_gc_write_chunk(
810 struct xfs_gc_bio *chunk)
811 {
812 struct xfs_zone_gc_data *data = chunk->data;
813 struct xfs_mount *mp = chunk->ip->i_mount;
814 phys_addr_t bvec_paddr =
815 bvec_phys(bio_first_bvec_all(&chunk->bio));
816 struct xfs_gc_bio *split_chunk;
817
818 if (chunk->bio.bi_status)
819 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
820 if (xfs_is_shutdown(mp)) {
821 xfs_zone_gc_free_chunk(chunk);
822 return;
823 }
824
825 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
826 list_move_tail(&chunk->entry, &data->writing);
827
828 bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
829 bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
830 offset_in_folio(chunk->scratch->folio, bvec_paddr));
831
832 while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
833 xfs_zone_gc_submit_write(data, split_chunk);
834 xfs_zone_gc_submit_write(data, chunk);
835 }
836
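/*
 * A GC write has completed.  Remap the file to the new location unless
 * another write raced with us and already changed the mapping, which
 * xfs_zoned_end_io() detects by comparing the old startblock.
 */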
837 static void
838 xfs_zone_gc_finish_chunk(
839 struct xfs_gc_bio *chunk)
840 {
841 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
842 struct xfs_inode *ip = chunk->ip;
843 struct xfs_mount *mp = ip->i_mount;
844 int error;
845
846 if (chunk->bio.bi_status)
847 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
848 if (xfs_is_shutdown(mp)) {
849 xfs_zone_gc_free_chunk(chunk);
850 return;
851 }
852
853 chunk->scratch->freed += chunk->len;
854 if (chunk->scratch->freed == chunk->scratch->offset) {
855 chunk->scratch->offset = 0;
856 chunk->scratch->freed = 0;
857 }
858
859 /*
860 * Cycle through the iolock and wait for direct I/O and layouts to
861 * ensure no one is reading from the old mapping before it goes away.
862 *
863 * Note that xfs_zoned_end_io() below checks that no other writer raced
864 * with us to update the mapping by checking that the old startblock
865 * didn't change.
866 */
867 xfs_ilock(ip, iolock);
868 error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
869 if (!error)
870 inode_dio_wait(VFS_I(ip));
871 xfs_iunlock(ip, iolock);
872 if (error)
873 goto free;
874
875 if (chunk->is_seq)
876 chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
877 error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
878 chunk->new_daddr, chunk->oz, chunk->old_startblock);
879 free:
880 if (error)
881 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
882 xfs_zone_gc_free_chunk(chunk);
883 }
884
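/*
 * A zone reset (or discard) has completed.  Return the zone to the free
 * pool and wake up all waiters for free space.
 */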
885 static void
886 xfs_zone_gc_finish_reset(
887 struct xfs_gc_bio *chunk)
888 {
889 struct xfs_rtgroup *rtg = chunk->bio.bi_private;
890 struct xfs_mount *mp = rtg_mount(rtg);
891 struct xfs_zone_info *zi = mp->m_zone_info;
892
893 if (chunk->bio.bi_status) {
894 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
895 goto out;
896 }
897
898 xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
899 atomic_inc(&zi->zi_nr_free_zones);
900
901 xfs_zoned_add_available(mp, rtg_blocks(rtg));
902
903 wake_up_all(&zi->zi_zone_wait);
904 out:
905 list_del(&chunk->entry);
906 bio_put(&chunk->bio);
907 }
908
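/*
 * Set up a bio to reset an empty zone.  Conventional zones can't be reset
 * and are discarded instead; return false if the zone is conventional and
 * the device does not support discard.
 */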
909 static bool
910 xfs_zone_gc_prepare_reset(
911 struct bio *bio,
912 struct xfs_rtgroup *rtg)
913 {
914 trace_xfs_zone_reset(rtg);
915
916 ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
917 bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
918 if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
919 if (!bdev_max_discard_sectors(bio->bi_bdev))
920 return false;
921 bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
922 bio->bi_iter.bi_size =
923 XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
924 }
925
926 return true;
927 }
928
929 int
930 xfs_zone_gc_reset_sync(
931 struct xfs_rtgroup *rtg)
932 {
933 int error = 0;
934 struct bio bio;
935
936 bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
937 REQ_OP_ZONE_RESET);
938 if (xfs_zone_gc_prepare_reset(&bio, rtg))
939 error = submit_bio_wait(&bio);
940 bio_uninit(&bio);
941
942 return error;
943 }
944
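/*
 * Reset all zones on the passed in reset list.  The preceding RT device
 * cache flush and the log force of each zone's rmap inode make sure
 * nothing still references the old blocks once the zones are reused.
 */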
945 static void
946 xfs_zone_gc_reset_zones(
947 struct xfs_zone_gc_data *data,
948 struct xfs_group *reset_list)
949 {
950 struct xfs_group *next = reset_list;
951
952 if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
953 xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
954 return;
955 }
956
957 do {
958 struct xfs_rtgroup *rtg = to_rtg(next);
959 struct xfs_gc_bio *chunk;
960 struct bio *bio;
961
962 xfs_log_force_inode(rtg_rmap(rtg));
963
964 next = rtg_group(rtg)->xg_next_reset;
965 rtg_group(rtg)->xg_next_reset = NULL;
966
967 bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
968 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
969 bio->bi_private = rtg;
970 bio->bi_end_io = xfs_zone_gc_end_io;
971
972 chunk = container_of(bio, struct xfs_gc_bio, bio);
973 chunk->data = data;
974 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
975 list_add_tail(&chunk->entry, &data->resetting);
976
977 /*
978 * Also use the bio to drive the state machine when neither
979 * zone reset nor discard is supported to keep things simple.
980 */
981 if (xfs_zone_gc_prepare_reset(bio, rtg))
982 submit_bio(bio);
983 else
984 bio_endio(bio);
985 } while (next);
986 }
987
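/*
 * Check if new GC work should be started: the file system must not be
 * shut down, a target zone and scratch space must be available, and
 * either a victim is already being processed or a new one can be found.
 */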
988 static bool
989 xfs_zone_gc_should_start_new_work(
990 struct xfs_zone_gc_data *data)
991 {
992 if (xfs_is_shutdown(data->mp))
993 return false;
994 if (!xfs_zone_gc_space_available(data))
995 return false;
996
997 if (!data->iter.victim_rtg) {
998 if (kthread_should_stop() || kthread_should_park())
999 return false;
1000 if (!xfs_zoned_need_gc(data->mp))
1001 return false;
1002 if (!xfs_zone_gc_select_victim(data))
1003 return false;
1004 }
1005
1006 return true;
1007 }
1008
1009 /*
1010 * Handle the work to read and write data for GC and to reset the zones,
1011 * including handling all completions.
1012 *
1013 * Note that the order of the chunks is preserved so that we don't undo the
1014 * optimal order established by xfs_zone_gc_query().
1015 */
1016 static void
1017 xfs_zone_gc_handle_work(
1018 struct xfs_zone_gc_data *data)
1019 {
1020 struct xfs_zone_info *zi = data->mp->m_zone_info;
1021 struct xfs_gc_bio *chunk, *next;
1022 struct xfs_group *reset_list;
1023 struct blk_plug plug;
1024
1025 spin_lock(&zi->zi_reset_list_lock);
1026 reset_list = zi->zi_reset_list;
1027 zi->zi_reset_list = NULL;
1028 spin_unlock(&zi->zi_reset_list_lock);
1029
1030 if (reset_list) {
1031 set_current_state(TASK_RUNNING);
1032 xfs_zone_gc_reset_zones(data, reset_list);
1033 }
1034
1035 list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1036 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1037 break;
1038 set_current_state(TASK_RUNNING);
1039 xfs_zone_gc_finish_reset(chunk);
1040 }
1041
1042 list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1043 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1044 break;
1045 set_current_state(TASK_RUNNING);
1046 xfs_zone_gc_finish_chunk(chunk);
1047 }
1048
1049 blk_start_plug(&plug);
1050 list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1051 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1052 break;
1053 set_current_state(TASK_RUNNING);
1054 xfs_zone_gc_write_chunk(chunk);
1055 }
1056 blk_finish_plug(&plug);
1057
1058 if (xfs_zone_gc_should_start_new_work(data)) {
1059 set_current_state(TASK_RUNNING);
1060 blk_start_plug(&plug);
1061 while (xfs_zone_gc_start_chunk(data))
1062 ;
1063 blk_finish_plug(&plug);
1064 }
1065 }
1066
1067 /*
1068 * Note that the current GC algorithm would break reflinks and thus duplicate
1069 * data that was shared by multiple owners before. Because of that reflinks
1070 * are currently not supported on zoned file systems and can't be created or
1071 * mounted.
1072 */
1073 static int
1074 xfs_zoned_gcd(
1075 void *private)
1076 {
1077 struct xfs_zone_gc_data *data = private;
1078 struct xfs_mount *mp = data->mp;
1079 struct xfs_zone_info *zi = mp->m_zone_info;
1080 unsigned int nofs_flag;
1081
1082 nofs_flag = memalloc_nofs_save();
1083 set_freezable();
1084
1085 for (;;) {
1086 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1087 xfs_set_zonegc_running(mp);
1088
1089 xfs_zone_gc_handle_work(data);
1090
1091 /*
1092 * Only sleep if nothing set the state back to running. Otherwise check
1093 * for work again, as someone might have queued up more work and woken
1094 * us up in the meantime.
1095 */
1096 if (get_current_state() == TASK_RUNNING) {
1097 try_to_freeze();
1098 continue;
1099 }
1100
1101 if (list_empty(&data->reading) &&
1102 list_empty(&data->writing) &&
1103 list_empty(&data->resetting) &&
1104 !zi->zi_reset_list) {
1105 xfs_clear_zonegc_running(mp);
1106 xfs_zoned_resv_wake_all(mp);
1107
1108 if (kthread_should_stop()) {
1109 __set_current_state(TASK_RUNNING);
1110 break;
1111 }
1112
1113 if (kthread_should_park()) {
1114 __set_current_state(TASK_RUNNING);
1115 kthread_parkme();
1116 continue;
1117 }
1118 }
1119
1120 schedule();
1121 }
1122 xfs_clear_zonegc_running(mp);
1123
1124 if (data->iter.victim_rtg)
1125 xfs_rtgroup_rele(data->iter.victim_rtg);
1126
1127 memalloc_nofs_restore(nofs_flag);
1128 xfs_zone_gc_data_free(data);
1129 return 0;
1130 }
1131
1132 void
1133 xfs_zone_gc_start(
1134 struct xfs_mount *mp)
1135 {
1136 if (xfs_has_zoned(mp))
1137 kthread_unpark(mp->m_zone_info->zi_gc_thread);
1138 }
1139
1140 void
1141 xfs_zone_gc_stop(
1142 struct xfs_mount *mp)
1143 {
1144 if (xfs_has_zoned(mp))
1145 kthread_park(mp->m_zone_info->zi_gc_thread);
1146 }
1147
1148 int
1149 xfs_zone_gc_mount(
1150 struct xfs_mount *mp)
1151 {
1152 struct xfs_zone_info *zi = mp->m_zone_info;
1153 struct xfs_zone_gc_data *data;
1154 struct xfs_open_zone *oz;
1155 int error;
1156
1157 /*
1158 * If there are no free zones available for GC, pick the open zone with
1159 * the least used space to GC into. This should only happen after an
1160 * unclean shutdown near ENOSPC while GC was ongoing.
1161 *
1162 * We also need to do this for the first gc zone allocation if we
1163 * unmounted while at the open limit.
1164 */
1165 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1166 zi->zi_nr_open_zones == mp->m_max_open_zones)
1167 oz = xfs_zone_gc_steal_open(zi);
1168 else
1169 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1170 if (!oz) {
1171 xfs_warn(mp, "unable to allocate a zone for gc");
1172 error = -EIO;
1173 goto out;
1174 }
1175
1176 trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1177 zi->zi_open_gc_zone = oz;
1178
1179 data = xfs_zone_gc_data_alloc(mp);
1180 if (!data) {
1181 error = -ENOMEM;
1182 goto out_put_gc_zone;
1183 }
1184
1185 zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1186 "xfs-zone-gc/%s", mp->m_super->s_id);
1187 if (IS_ERR(zi->zi_gc_thread)) {
1188 xfs_warn(mp, "unable to create zone gc thread");
1189 error = PTR_ERR(zi->zi_gc_thread);
1190 goto out_free_gc_data;
1191 }
1192
1193 /* xfs_zone_gc_start will unpark for rw mounts */
1194 kthread_park(zi->zi_gc_thread);
1195 return 0;
1196
1197 out_free_gc_data:
1198 kfree(data);
1199 out_put_gc_zone:
1200 xfs_open_zone_put(zi->zi_open_gc_zone);
1201 out:
1202 return error;
1203 }
1204
1205 void
1206 xfs_zone_gc_unmount(
1207 struct xfs_mount *mp)
1208 {
1209 struct xfs_zone_info *zi = mp->m_zone_info;
1210
1211 kthread_stop(zi->zi_gc_thread);
1212 if (zi->zi_open_gc_zone)
1213 xfs_open_zone_put(zi->zi_open_gc_zone);
1214 }
1215