// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move the data remaining in a zone out of it to reset the zone to
 * prepare for writing to it again.
 *
 * This is done by the GC thread implemented in this file. To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list. Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio. Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpad used, and index to indicate which one is used.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	if (available <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		    remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}

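/*
 * Allocate the per-mount GC state: the record buffer for the rmap iterator,
 * the bioset backing the gc_bios and the scratch folios that GCed data is
 * staged in.
 */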
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec. It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)

{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

static int
xfs_zone_gc_rmap_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int			diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * the inode mappings to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

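/*
 * Return the next rmap record to garbage collect, including a reference to
 * the inode that owns it.  Queries the next batch of rmap records when the
 * current one is exhausted, and skips records whose inode has already been
 * deleted or is not a regular realtime inode.
 */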
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

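/*
 * Pick the reclaimable zone with the least used blocks from the given
 * used-space bucket, so that GC has to move as little live data as possible.
 */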
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

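/*
 * Remove the open zone with the least allocated space from the list of
 * zones available to regular writers and dedicate it to GC.
 */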
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

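/*
 * Open a new GC target zone once all writes to the previous one have
 * completed, and install it as zi_open_gc_zone.
 */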
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space, keep writing to it; else first wait
 * for all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_allocated < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

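/*
 * Completion handler shared by the GC read and write bios and the zone
 * resets: mark the chunk as done and kick the GC thread to process it.
 */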
static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

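/*
 * Allocate blocks in the GC target zone for the next chunk, clamped to the
 * available scratch space, the remaining space in the target zone and the
 * reserved free space.  Returns the open zone with an extra reference held,
 * or NULL if nothing could be allocated.
 */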
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

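/*
 * Start garbage collecting the next chunk: pick the next rmap record from
 * the victim zone, allocate space for it in the GC target and the scratch
 * folio, and submit the read bio for the old data.
 */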
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

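/*
 * Zone append bios must not exceed the max_zone_append_sectors queue limit.
 * Split off a block size aligned prefix into its own chunk when needed and
 * queue it right before the remainder of the original chunk.
 */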
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

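/*
 * The read into the scratch folio completed.  Reuse the bio to write the
 * data out to the GC target zone, splitting it as needed for zone append.
 */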
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	phys_addr_t		bvec_paddr =
		bvec_phys(bio_first_bvec_all(&chunk->bio));
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			offset_in_folio(chunk->scratch->folio, bvec_paddr));

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

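/*
 * The write to the GC target zone completed.  Release the scratch space and
 * remap the file range to the new location, unless another write raced with
 * us and already changed the mapping.
 */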
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

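/*
 * A zone reset (or discard) completed.  Mark the zone as free again and wake
 * up anyone waiting for fresh zones.
 */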
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

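/*
 * Set up the bio to reset an empty zone.  Conventional zones are reclaimed
 * with a discard instead; return false when that is not supported.
 */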
static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

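/*
 * Reset all zones queued up on the reset list.  Flush the RT device cache
 * and force out the rmap inode log first, so that the reset only happens
 * after all transactions referencing the rmap have been committed to disk.
 */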
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data	*data)
{
	if (xfs_is_shutdown(data->mp))
		return false;
	if (!xfs_zone_gc_space_available(data))
		return false;

	if (!data->iter.victim_rtg) {
		if (kthread_should_stop() || kthread_should_park())
			return false;
		if (!xfs_zoned_need_gc(data->mp))
			return false;
		if (!xfs_zone_gc_select_victim(data))
			return false;
	}

	return true;
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that, reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);

		xfs_zone_gc_handle_work(data);

		/*
		 * Only sleep if nothing set the state back to running.
		 * Otherwise check for work again, as someone might have queued
		 * up more work and woken us in the meantime.
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;
		}

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

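/*
 * Set up the GC state at mount time: open (or steal) the initial GC target
 * zone, allocate the per-mount GC data and create the initially parked GC
 * thread.
 */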
int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into. This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}