xref: /linux/fs/xfs/xfs_zone_gc.c (revision cd188e9ef80fd005fd8c8de34ed649bd653d00e5)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023-2025 Christoph Hellwig.
4  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5  */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_zone_alloc.h"
20 #include "xfs_zone_priv.h"
21 #include "xfs_zones.h"
22 #include "xfs_trace.h"
23 
24 /*
25  * Implement Garbage Collection (GC) of partially used zones.
26  *
27  * To support the purely sequential writes in each zone, zoned XFS needs to be
28  * able to move data remaining in a zone out of it to reset the zone to prepare
29  * for writing to it again.
30  *
31  * This is done by the GC thread implemented in this file.  To support that, a
32  * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
33  * write the garbage collected data into.
34  *
35  * Whenever the available space is below the chosen threshold, the GC thread
36  * looks for non-empty but not fully used zones that are worth reclaiming.
37  * Once a victim is found, its rmap is queried, and after
38  * a bit of sorting to reduce fragmentation, the still live extents are read
39  * into memory and written to the GC target zone, and the bmap btree of the
40  * files is updated to point to the new location.  To avoid taking the IOLOCK
41  * and MMAPLOCK for the entire GC process and thus affecting the latency of
42  * user reads and writes to the files, the GC writes are speculative and the
43  * I/O completion checks that no other writes happened for the affected regions
44  * before remapping.
45  *
46  * Once a zone does not contain any valid data, be that through GC or user
47  * block removal, it is queued for a zone reset.  The reset operation
48  * carefully ensures that the RT device cache is flushed and all transactions
49  * referencing the rmap have been committed to disk.
50  */
51 
52 /*
53  * Size of each GC scratch pad.  This is also the upper bound for each
54  * GC I/O, which helps to keep latency down.
55  */
56 #define XFS_GC_CHUNK_SIZE	SZ_1M
57 
58 /*
59  * Scratchpad data to read GCed data into.
60  *
61  * The offset member tracks where the next allocation starts, and freed tracks
62  * the amount of space that is not used anymore.
63  */
64 #define XFS_ZONE_GC_NR_SCRATCH	2
65 struct xfs_zone_scratch {
66 	struct folio			*folio;
67 	unsigned int			offset;
68 	unsigned int			freed;
69 };
70 
71 /*
72  * Chunk that is read and written for each GC operation.
73  *
74  * Note that for writes to actual zoned devices, the chunk can be split when
75  * reaching the hardware limit.
76  */
77 struct xfs_gc_bio {
78 	struct xfs_zone_gc_data		*data;
79 
80 	/*
81 	 * Entry into the reading/writing/resetting list.  Only accessed from
82 	 * the GC thread, so no locking needed.
83 	 */
84 	struct list_head		entry;
85 
86 	/*
87 	 * State of this gc_bio.  Done means the current I/O completed.
88 	 * Set from the bio end I/O handler, read from the GC thread.
89 	 */
90 	enum {
91 		XFS_GC_BIO_NEW,
92 		XFS_GC_BIO_DONE,
93 	} state;
94 
95 	/*
96 	 * Pointer to the inode and byte range in the inode that this
97 	 * GC chunk is operating on.
98 	 */
99 	struct xfs_inode		*ip;
100 	loff_t				offset;
101 	unsigned int			len;
102 
103 	/*
104 	 * Existing startblock (in the zone to be freed) and newly assigned
105 	 * daddr in the zone GCed into.
106 	 */
107 	xfs_fsblock_t			old_startblock;
108 	xfs_daddr_t			new_daddr;
109 	struct xfs_zone_scratch		*scratch;
110 
111 	/* Are we writing to a sequential write required zone? */
112 	bool				is_seq;
113 
114 	/* Open Zone being written to */
115 	struct xfs_open_zone		*oz;
116 
117 	/* Bio used for reads and writes, including the bvec used by it */
118 	struct bio_vec			bv;
119 	struct bio			bio;	/* must be last */
120 };
121 
122 #define XFS_ZONE_GC_RECS		1024
123 
124 /* iterator, needs to be reinitialized for each victim zone */
125 struct xfs_zone_gc_iter {
126 	struct xfs_rtgroup		*victim_rtg;
127 	unsigned int			rec_count;
128 	unsigned int			rec_idx;
129 	xfs_agblock_t			next_startblock;
130 	struct xfs_rmap_irec		*recs;
131 };
132 
133 /*
134  * Per-mount GC state.
135  */
136 struct xfs_zone_gc_data {
137 	struct xfs_mount		*mp;
138 
139 	/* bioset used to allocate the gc_bios */
140 	struct bio_set			bio_set;
141 
142 	/*
143 	 * Scratchpads used, and the index indicating which one is in use.
144 	 */
145 	struct xfs_zone_scratch		scratch[XFS_ZONE_GC_NR_SCRATCH];
146 	unsigned int			scratch_idx;
147 
148 	/*
149 	 * Lists of bios currently being read, written and reset.
150 	 * These lists are only accessed by the GC thread itself, and must only
151 	 * be processed in order.
152 	 */
153 	struct list_head		reading;
154 	struct list_head		writing;
155 	struct list_head		resetting;
156 
157 	/*
158 	 * Iterator for the victim zone.
159 	 */
160 	struct xfs_zone_gc_iter		iter;
161 };
162 
163 /*
164  * We aim to keep enough zones free in stock to fully use the open zone limit
165  * for data placement purposes. Additionally, the m_zonegc_low_space tunable
166  * can be set to make sure a fraction of the unused blocks are available for
167  * writing.
168  */
169 bool
170 xfs_zoned_need_gc(
171 	struct xfs_mount	*mp)
172 {
173 	s64			available, free;
174 
175 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
176 		return false;
177 
178 	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
179 
180 	if (available <
181 	    mp->m_groups[XG_TYPE_RTG].blocks *
182 	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
183 		return true;
184 
185 	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
186 	if (available < mult_frac(free, mp->m_zonegc_low_space, 100))
187 		return true;
188 
189 	return false;
190 }
191 
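/*
 * Allocate and initialize the per-mount GC state: the rmap record buffer used
 * by the victim iterator, the bio_set that embeds struct xfs_gc_bio, the
 * scratch folios and the chunk lists.
 */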
192 static struct xfs_zone_gc_data *
193 xfs_zone_gc_data_alloc(
194 	struct xfs_mount	*mp)
195 {
196 	struct xfs_zone_gc_data	*data;
197 	int			i;
198 
199 	data = kzalloc(sizeof(*data), GFP_KERNEL);
200 	if (!data)
201 		return NULL;
202 	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
203 			GFP_KERNEL);
204 	if (!data->iter.recs)
205 		goto out_free_data;
206 
207 	/*
208 	 * We actually only need a single bio_vec.  It would be nice to have
209 	 * a flag that only allocates the inline bvecs and not the separate
210 	 * bvec pool.
211 	 */
212 	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
213 			BIOSET_NEED_BVECS))
214 		goto out_free_recs;
215 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
216 		data->scratch[i].folio =
217 			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
218 		if (!data->scratch[i].folio)
219 			goto out_free_scratch;
220 	}
221 	INIT_LIST_HEAD(&data->reading);
222 	INIT_LIST_HEAD(&data->writing);
223 	INIT_LIST_HEAD(&data->resetting);
224 	data->mp = mp;
225 	return data;
226 
227 out_free_scratch:
228 	while (--i >= 0)
229 		folio_put(data->scratch[i].folio);
230 	bioset_exit(&data->bio_set);
231 out_free_recs:
232 	kfree(data->iter.recs);
233 out_free_data:
234 	kfree(data);
235 	return NULL;
236 }
237 
238 static void
239 xfs_zone_gc_data_free(
240 	struct xfs_zone_gc_data	*data)
241 {
242 	int			i;
243 
244 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
245 		folio_put(data->scratch[i].folio);
246 	bioset_exit(&data->bio_set);
247 	kfree(data->iter.recs);
248 	kfree(data);
249 }
250 
251 static void
252 xfs_zone_gc_iter_init(
253 	struct xfs_zone_gc_iter	*iter,
254 	struct xfs_rtgroup	*victim_rtg)
255 
256 {
257 	iter->next_startblock = 0;
258 	iter->rec_count = 0;
259 	iter->rec_idx = 0;
260 	iter->victim_rtg = victim_rtg;
261 }
262 
263 /*
264  * Query the rmap of the victim zone to gather the records to evacuate.
265  */
266 static int
267 xfs_zone_gc_query_cb(
268 	struct xfs_btree_cur	*cur,
269 	const struct xfs_rmap_irec *irec,
270 	void			*private)
271 {
272 	struct xfs_zone_gc_iter	*iter = private;
273 
274 	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
275 	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
276 	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
277 
278 	iter->recs[iter->rec_count] = *irec;
279 	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
280 		iter->next_startblock =
281 			irec->rm_startblock + irec->rm_blockcount;
282 		return 1;
283 	}
284 	return 0;
285 }
286 
287 #define cmp_int(l, r)		(((l) > (r)) - ((l) < (r)))
288 
289 static int
290 xfs_zone_gc_rmap_rec_cmp(
291 	const void			*a,
292 	const void			*b)
293 {
294 	const struct xfs_rmap_irec	*reca = a;
295 	const struct xfs_rmap_irec	*recb = b;
296 	int				diff;
297 
298 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
299 	if (diff)
300 		return diff;
301 	return cmp_int(reca->rm_offset, recb->rm_offset);
302 }
303 
304 static int
305 xfs_zone_gc_query(
306 	struct xfs_mount	*mp,
307 	struct xfs_zone_gc_iter	*iter)
308 {
309 	struct xfs_rtgroup	*rtg = iter->victim_rtg;
310 	struct xfs_rmap_irec	ri_low = { };
311 	struct xfs_rmap_irec	ri_high;
312 	struct xfs_btree_cur	*cur;
313 	struct xfs_trans	*tp;
314 	int			error;
315 
316 	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
317 	if (iter->next_startblock == rtg_blocks(rtg))
318 		goto done;
319 
320 	ASSERT(iter->next_startblock < rtg_blocks(rtg));
321 	ri_low.rm_startblock = iter->next_startblock;
322 	memset(&ri_high, 0xFF, sizeof(ri_high));
323 
324 	iter->rec_idx = 0;
325 	iter->rec_count = 0;
326 
327 	error = xfs_trans_alloc_empty(mp, &tp);
328 	if (error)
329 		return error;
330 
331 	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
332 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
333 	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
334 			xfs_zone_gc_query_cb, iter);
335 	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
336 	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
337 	xfs_trans_cancel(tp);
338 
339 	if (error < 0)
340 		return error;
341 
342 	/*
343 	 * Sort the rmap records by inode number and increasing offset to
344 	 * defragment the mappings.
345 	 *
346 	 * This could be further enhanced by a bigger look-ahead window, but
347 	 * that's better left until we have better detection of changes to the
348 	 * inode mappings, to avoid the potential of GCing already dead data.
349 	 */
350 	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
351 			xfs_zone_gc_rmap_rec_cmp, NULL);
352 
353 	if (error == 0) {
354 		/*
355 		 * We finished iterating through the zone.
356 		 */
357 		iter->next_startblock = rtg_blocks(rtg);
358 		if (iter->rec_count == 0)
359 			goto done;
360 	}
361 
362 	return 0;
363 done:
364 	xfs_rtgroup_rele(iter->victim_rtg);
365 	iter->victim_rtg = NULL;
366 	return 0;
367 }
368 
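/*
 * Return the next rmap record of the victim zone and grab a reference to the
 * inode owning it.  Records for inodes that have been deleted or that are not
 * regular realtime files are skipped; any other lookup failure shuts down the
 * file system.  Returns false once the victim zone is fully processed.
 */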
369 static bool
370 xfs_zone_gc_iter_next(
371 	struct xfs_mount	*mp,
372 	struct xfs_zone_gc_iter	*iter,
373 	struct xfs_rmap_irec	*chunk_rec,
374 	struct xfs_inode	**ipp)
375 {
376 	struct xfs_rmap_irec	*irec;
377 	int			error;
378 
379 	if (!iter->victim_rtg)
380 		return false;
381 
382 retry:
383 	if (iter->rec_idx == iter->rec_count) {
384 		error = xfs_zone_gc_query(mp, iter);
385 		if (error)
386 			goto fail;
387 		if (!iter->victim_rtg)
388 			return false;
389 	}
390 
391 	irec = &iter->recs[iter->rec_idx];
392 	error = xfs_iget(mp, NULL, irec->rm_owner,
393 			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
394 	if (error) {
395 		/*
396 		 * If the inode was already deleted, skip over it.
397 		 */
398 		if (error == -ENOENT) {
399 			iter->rec_idx++;
400 			goto retry;
401 		}
402 		goto fail;
403 	}
404 
405 	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
406 		iter->rec_idx++;
407 		xfs_irele(*ipp);
408 		goto retry;
409 	}
410 
411 	*chunk_rec = *irec;
412 	return true;
413 
414 fail:
415 	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
416 	return false;
417 }
418 
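/*
 * Consume count_fsb blocks from the current rmap record and move on to the
 * next record once it is fully used up.
 */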
419 static void
420 xfs_zone_gc_iter_advance(
421 	struct xfs_zone_gc_iter	*iter,
422 	xfs_extlen_t		count_fsb)
423 {
424 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
425 
426 	irec->rm_offset += count_fsb;
427 	irec->rm_startblock += count_fsb;
428 	irec->rm_blockcount -= count_fsb;
429 	if (!irec->rm_blockcount)
430 		iter->rec_idx++;
431 }
432 
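/*
 * Find the zone with the fewest used blocks in the given used-space bucket.
 * Called with zi_used_buckets_lock held; a reference is taken on the returned
 * rtgroup.
 */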
433 static struct xfs_rtgroup *
434 xfs_zone_gc_pick_victim_from(
435 	struct xfs_mount	*mp,
436 	uint32_t		bucket)
437 {
438 	struct xfs_zone_info	*zi = mp->m_zone_info;
439 	uint32_t		victim_used = U32_MAX;
440 	struct xfs_rtgroup	*victim_rtg = NULL;
441 	uint32_t		bit;
442 
443 	if (!zi->zi_used_bucket_entries[bucket])
444 		return NULL;
445 
446 	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
447 			mp->m_sb.sb_rgcount) {
448 		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
449 
450 		if (!rtg)
451 			continue;
452 
453 		/* skip zones that are just waiting for a reset */
454 		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
455 		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
456 			xfs_rtgroup_rele(rtg);
457 			continue;
458 		}
459 
460 		if (victim_rtg)
461 			xfs_rtgroup_rele(victim_rtg);
462 		victim_rtg = rtg;
463 		victim_used = rtg_rmap(rtg)->i_used_blocks;
464 
465 		/*
466 		 * Any zone that is less than 1 percent used is fair game for
467 		 * instant reclaim. All of these zones are in the first
468 		 * bucket, so avoid the expensive division for the zones
469 		 * in the other buckets.
470 		 */
471 		if (bucket == 0 &&
472 		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
473 			break;
474 	}
475 
476 	return victim_rtg;
477 }
478 
479 /*
480  * Iterate through all zones marked as reclaimable and find a candidate to
481  * reclaim.
482  */
483 static bool
484 xfs_zone_gc_select_victim(
485 	struct xfs_zone_gc_data	*data)
486 {
487 	struct xfs_zone_gc_iter	*iter = &data->iter;
488 	struct xfs_mount	*mp = data->mp;
489 	struct xfs_zone_info	*zi = mp->m_zone_info;
490 	struct xfs_rtgroup	*victim_rtg = NULL;
491 	unsigned int		bucket;
492 
493 	if (xfs_is_shutdown(mp))
494 		return false;
495 
496 	if (iter->victim_rtg)
497 		return true;
498 
499 	/*
500 	 * Don't start new work if we are asked to stop or park.
501 	 */
502 	if (kthread_should_stop() || kthread_should_park())
503 		return false;
504 
505 	if (!xfs_zoned_need_gc(mp))
506 		return false;
507 
508 	spin_lock(&zi->zi_used_buckets_lock);
509 	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
510 		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
511 		if (victim_rtg)
512 			break;
513 	}
514 	spin_unlock(&zi->zi_used_buckets_lock);
515 
516 	if (!victim_rtg)
517 		return false;
518 
519 	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
520 	xfs_zone_gc_iter_init(iter, victim_rtg);
521 	return true;
522 }
523 
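/*
 * Take the open zone with the lowest write pointer away from the regular
 * writers and dedicate it to GC.
 */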
524 static struct xfs_open_zone *
525 xfs_zone_gc_steal_open(
526 	struct xfs_zone_info	*zi)
527 {
528 	struct xfs_open_zone	*oz, *found = NULL;
529 
530 	spin_lock(&zi->zi_open_zones_lock);
531 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
532 		if (!found ||
533 		    oz->oz_write_pointer < found->oz_write_pointer)
534 			found = oz;
535 	}
536 
537 	if (found) {
538 		found->oz_is_gc = true;
539 		list_del_init(&found->oz_entry);
540 		zi->zi_nr_open_zones--;
541 	}
542 
543 	spin_unlock(&zi->zi_open_zones_lock);
544 	return found;
545 }
546 
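/*
 * Open a new GC target zone.  Returns NULL while writes to the previous
 * target are still pending.
 */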
547 static struct xfs_open_zone *
548 xfs_zone_gc_select_target(
549 	struct xfs_mount	*mp)
550 {
551 	struct xfs_zone_info	*zi = mp->m_zone_info;
552 	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;
553 
554 	/*
555 	 * We need to wait for pending writes to finish.
556 	 */
557 	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
558 		return NULL;
559 
560 	ASSERT(zi->zi_nr_open_zones <=
561 		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
562 	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
563 	if (oz)
564 		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
565 	spin_lock(&zi->zi_open_zones_lock);
566 	zi->zi_open_gc_zone = oz;
567 	spin_unlock(&zi->zi_open_zones_lock);
568 	return oz;
569 }
570 
571 /*
572  * Ensure we have a valid open zone to write the GC data to.
573  *
574  * If the current target zone has space, keep writing to it; otherwise first
575  * wait for all pending writes to finish and then pick a new one.
576  */
577 static struct xfs_open_zone *
578 xfs_zone_gc_ensure_target(
579 	struct xfs_mount	*mp)
580 {
581 	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;
582 
583 	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
584 		return xfs_zone_gc_select_target(mp);
585 	return oz;
586 }
587 
588 static unsigned int
589 xfs_zone_gc_scratch_available(
590 	struct xfs_zone_gc_data	*data)
591 {
592 	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
593 }
594 
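/*
 * Check that the GC target zone still has unwritten blocks and that there is
 * scratch space left to read more data into.
 */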
595 static bool
596 xfs_zone_gc_space_available(
597 	struct xfs_zone_gc_data	*data)
598 {
599 	struct xfs_open_zone	*oz;
600 
601 	oz = xfs_zone_gc_ensure_target(data->mp);
602 	if (!oz)
603 		return false;
604 	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
605 		xfs_zone_gc_scratch_available(data);
606 }
607 
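/*
 * Completion handler shared by all GC bios (reads, writes and zone resets):
 * mark the chunk as done and wake the GC thread, which performs the actual
 * completion processing.
 */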
608 static void
609 xfs_zone_gc_end_io(
610 	struct bio		*bio)
611 {
612 	struct xfs_gc_bio	*chunk =
613 		container_of(bio, struct xfs_gc_bio, bio);
614 	struct xfs_zone_gc_data	*data = chunk->data;
615 
616 	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
617 	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
618 }
619 
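/*
 * Allocate blocks in the GC target zone for the next chunk.  The length is
 * capped by the available scratch space, the space left in the target zone
 * and the reserved free space pools, which are charged here directly so that
 * GC never has to wait for regular writers.
 */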
620 static struct xfs_open_zone *
621 xfs_zone_gc_alloc_blocks(
622 	struct xfs_zone_gc_data	*data,
623 	xfs_extlen_t		*count_fsb,
624 	xfs_daddr_t		*daddr,
625 	bool			*is_seq)
626 {
627 	struct xfs_mount	*mp = data->mp;
628 	struct xfs_open_zone	*oz;
629 
630 	oz = xfs_zone_gc_ensure_target(mp);
631 	if (!oz)
632 		return NULL;
633 
634 	*count_fsb = min(*count_fsb,
635 		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
636 
637 	/*
638 	 * Directly allocate GC blocks from the reserved pool.
639 	 *
640 	 * If we'd take them from the normal pool we could be stealing blocks
641 	 * from a regular writer, which would then have to wait for GC and
642 	 * deadlock.
643 	 */
644 	spin_lock(&mp->m_sb_lock);
645 	*count_fsb = min(*count_fsb,
646 			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
647 	*count_fsb = min3(*count_fsb,
648 			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
649 			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
650 	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
651 	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
652 	spin_unlock(&mp->m_sb_lock);
653 
654 	if (!*count_fsb)
655 		return NULL;
656 
657 	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
658 	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
659 	if (!*is_seq)
660 		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
661 	oz->oz_write_pointer += *count_fsb;
662 	atomic_inc(&oz->oz_ref);
663 	return oz;
664 }
665 
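/*
 * Start reading the next chunk of live data from the victim zone into the
 * scratch folio.  Returns false if there is no more data to GC, no space in
 * the GC target zone, or the file system has shut down.
 */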
666 static bool
667 xfs_zone_gc_start_chunk(
668 	struct xfs_zone_gc_data	*data)
669 {
670 	struct xfs_zone_gc_iter	*iter = &data->iter;
671 	struct xfs_mount	*mp = data->mp;
672 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
673 	struct xfs_open_zone	*oz;
674 	struct xfs_rmap_irec	irec;
675 	struct xfs_gc_bio	*chunk;
676 	struct xfs_inode	*ip;
677 	struct bio		*bio;
678 	xfs_daddr_t		daddr;
679 	bool			is_seq;
680 
681 	if (xfs_is_shutdown(mp))
682 		return false;
683 
684 	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
685 		return false;
686 	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
687 			&is_seq);
688 	if (!oz) {
689 		xfs_irele(ip);
690 		return false;
691 	}
692 
693 	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
694 
695 	chunk = container_of(bio, struct xfs_gc_bio, bio);
696 	chunk->ip = ip;
697 	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
698 	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
699 	chunk->old_startblock =
700 		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
701 	chunk->new_daddr = daddr;
702 	chunk->is_seq = is_seq;
703 	chunk->scratch = &data->scratch[data->scratch_idx];
704 	chunk->data = data;
705 	chunk->oz = oz;
706 
707 	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
708 	bio->bi_end_io = xfs_zone_gc_end_io;
709 	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
710 			chunk->scratch->offset);
711 	chunk->scratch->offset += chunk->len;
712 	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
713 		data->scratch_idx =
714 			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
715 	}
716 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
717 	list_add_tail(&chunk->entry, &data->reading);
718 	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
719 
720 	submit_bio(bio);
721 	return true;
722 }
723 
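/*
 * Tear down a GC chunk: drop the open zone and inode references and free the
 * bio.
 */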
724 static void
725 xfs_zone_gc_free_chunk(
726 	struct xfs_gc_bio	*chunk)
727 {
728 	list_del(&chunk->entry);
729 	xfs_open_zone_put(chunk->oz);
730 	xfs_irele(chunk->ip);
731 	bio_put(&chunk->bio);
732 }
733 
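/*
 * Submit the GC write.  Sequential write required zones are written using
 * zone append, so the actual write location is only known at I/O completion
 * time.
 */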
734 static void
735 xfs_zone_gc_submit_write(
736 	struct xfs_zone_gc_data	*data,
737 	struct xfs_gc_bio	*chunk)
738 {
739 	if (chunk->is_seq) {
740 		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
741 		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
742 	}
743 	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
744 	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
745 	submit_bio(&chunk->bio);
746 }
747 
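/*
 * Zone append bios cannot exceed the hardware limit.  Split off a block size
 * aligned bio that fits the limit and queue it right before the remaining
 * chunk so that the overall ordering is preserved.
 */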
748 static struct xfs_gc_bio *
749 xfs_zone_gc_split_write(
750 	struct xfs_zone_gc_data	*data,
751 	struct xfs_gc_bio	*chunk)
752 {
753 	struct queue_limits	*lim =
754 		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
755 	struct xfs_gc_bio	*split_chunk;
756 	int			split_sectors;
757 	unsigned int		split_len;
758 	struct bio		*split;
759 	unsigned int		nsegs;
760 
761 	if (!chunk->is_seq)
762 		return NULL;
763 
764 	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
765 			lim->max_zone_append_sectors << SECTOR_SHIFT);
766 	if (!split_sectors)
767 		return NULL;
768 
769 	/* ensure the split chunk is still block size aligned */
770 	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
771 			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
772 	split_len = split_sectors << SECTOR_SHIFT;
773 
774 	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
775 	split_chunk = container_of(split, struct xfs_gc_bio, bio);
776 	split_chunk->data = data;
777 	ihold(VFS_I(chunk->ip));
778 	split_chunk->ip = chunk->ip;
779 	split_chunk->is_seq = chunk->is_seq;
780 	split_chunk->scratch = chunk->scratch;
781 	split_chunk->offset = chunk->offset;
782 	split_chunk->len = split_len;
783 	split_chunk->old_startblock = chunk->old_startblock;
784 	split_chunk->new_daddr = chunk->new_daddr;
785 	split_chunk->oz = chunk->oz;
786 	atomic_inc(&chunk->oz->oz_ref);
787 
788 	chunk->offset += split_len;
789 	chunk->len -= split_len;
790 	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
791 
792 	/* add right before the original chunk */
793 	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
794 	list_add_tail(&split_chunk->entry, &chunk->entry);
795 	return split_chunk;
796 }
797 
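/*
 * A GC read completed.  Reuse the bio to write the data from the scratch
 * folio to the newly allocated blocks in the GC target zone.
 */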
798 static void
799 xfs_zone_gc_write_chunk(
800 	struct xfs_gc_bio	*chunk)
801 {
802 	struct xfs_zone_gc_data	*data = chunk->data;
803 	struct xfs_mount	*mp = chunk->ip->i_mount;
804 	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
805 	struct xfs_gc_bio	*split_chunk;
806 
807 	if (chunk->bio.bi_status)
808 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
809 	if (xfs_is_shutdown(mp)) {
810 		xfs_zone_gc_free_chunk(chunk);
811 		return;
812 	}
813 
814 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
815 	list_move_tail(&chunk->entry, &data->writing);
816 
817 	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
818 	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
819 			folio_offset);
820 
821 	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
822 		xfs_zone_gc_submit_write(data, split_chunk);
823 	xfs_zone_gc_submit_write(data, chunk);
824 }
825 
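/*
 * A GC write completed.  Wait out racing direct I/O and layouts, then remap
 * the file to the new data location unless the extent was overwritten in the
 * meantime, which xfs_zoned_end_io() detects using the old startblock.
 */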
826 static void
827 xfs_zone_gc_finish_chunk(
828 	struct xfs_gc_bio	*chunk)
829 {
830 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
831 	struct xfs_inode	*ip = chunk->ip;
832 	struct xfs_mount	*mp = ip->i_mount;
833 	int			error;
834 
835 	if (chunk->bio.bi_status)
836 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
837 	if (xfs_is_shutdown(mp)) {
838 		xfs_zone_gc_free_chunk(chunk);
839 		return;
840 	}
841 
842 	chunk->scratch->freed += chunk->len;
843 	if (chunk->scratch->freed == chunk->scratch->offset) {
844 		chunk->scratch->offset = 0;
845 		chunk->scratch->freed = 0;
846 	}
847 
848 	/*
849 	 * Cycle through the iolock and wait for direct I/O and layouts to
850 	 * ensure no one is reading from the old mapping before it goes away.
851 	 *
852 	 * Note that xfs_zoned_end_io() below checks that no other writer raced
853 	 * with us to update the mapping by checking that the old startblock
854 	 * didn't change.
855 	 */
856 	xfs_ilock(ip, iolock);
857 	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
858 	if (!error)
859 		inode_dio_wait(VFS_I(ip));
860 	xfs_iunlock(ip, iolock);
861 	if (error)
862 		goto free;
863 
864 	if (chunk->is_seq)
865 		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
866 	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
867 			chunk->new_daddr, chunk->oz, chunk->old_startblock);
868 free:
869 	if (error)
870 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
871 	xfs_zone_gc_free_chunk(chunk);
872 }
873 
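/*
 * A zone reset (or discard) completed.  Mark the zone as free, account the
 * newly available blocks and wake up anyone waiting for free zones.
 */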
874 static void
875 xfs_zone_gc_finish_reset(
876 	struct xfs_gc_bio	*chunk)
877 {
878 	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
879 	struct xfs_mount	*mp = rtg_mount(rtg);
880 	struct xfs_zone_info	*zi = mp->m_zone_info;
881 
882 	if (chunk->bio.bi_status) {
883 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
884 		goto out;
885 	}
886 
887 	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
888 	atomic_inc(&zi->zi_nr_free_zones);
889 
890 	xfs_zoned_add_available(mp, rtg_blocks(rtg));
891 
892 	wake_up_all(&zi->zi_zone_wait);
893 out:
894 	list_del(&chunk->entry);
895 	bio_put(&chunk->bio);
896 }
897 
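/*
 * Prepare a bio to reset a zone.  Conventional zones can't be reset, so issue
 * a discard covering the whole zone instead if the device supports it.
 */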
898 static bool
899 xfs_zone_gc_prepare_reset(
900 	struct bio		*bio,
901 	struct xfs_rtgroup	*rtg)
902 {
903 	trace_xfs_zone_reset(rtg);
904 
905 	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
906 	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
907 	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
908 		if (!bdev_max_discard_sectors(bio->bi_bdev))
909 			return false;
910 		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
911 		bio->bi_iter.bi_size =
912 			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
913 	}
914 
915 	return true;
916 }
917 
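/*
 * Synchronously reset a single zone using an on-stack bio, for callers that
 * do not go through the asynchronous GC state machine.
 */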
918 int
919 xfs_zone_gc_reset_sync(
920 	struct xfs_rtgroup	*rtg)
921 {
922 	int			error = 0;
923 	struct bio		bio;
924 
925 	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
926 			REQ_OP_ZONE_RESET);
927 	if (xfs_zone_gc_prepare_reset(&bio, rtg))
928 		error = submit_bio_wait(&bio);
929 	bio_uninit(&bio);
930 
931 	return error;
932 }
933 
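/*
 * Reset all zones queued up on the reset list.  The RT device cache is
 * flushed and the rmap inodes are forced to the log first so that no
 * references to the old zone contents can survive the reset.
 */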
934 static void
935 xfs_zone_gc_reset_zones(
936 	struct xfs_zone_gc_data	*data,
937 	struct xfs_group	*reset_list)
938 {
939 	struct xfs_group	*next = reset_list;
940 
941 	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
942 		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
943 		return;
944 	}
945 
946 	do {
947 		struct xfs_rtgroup	*rtg = to_rtg(next);
948 		struct xfs_gc_bio	*chunk;
949 		struct bio		*bio;
950 
951 		xfs_log_force_inode(rtg_rmap(rtg));
952 
953 		next = rtg_group(rtg)->xg_next_reset;
954 		rtg_group(rtg)->xg_next_reset = NULL;
955 
956 		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
957 				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
958 		bio->bi_private = rtg;
959 		bio->bi_end_io = xfs_zone_gc_end_io;
960 
961 		chunk = container_of(bio, struct xfs_gc_bio, bio);
962 		chunk->data = data;
963 		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
964 		list_add_tail(&chunk->entry, &data->resetting);
965 
966 		/*
967 		 * To keep things simple, also use the bio to drive the state
968 		 * machine when neither zone reset nor discard is supported.
969 		 */
970 		if (xfs_zone_gc_prepare_reset(bio, rtg))
971 			submit_bio(bio);
972 		else
973 			bio_endio(bio);
974 	} while (next);
975 }
976 
977 /*
978  * Handle the work to read and write data for GC and to reset the zones,
979  * including handling all completions.
980  *
981  * Note that the order of the chunks is preserved so that we don't undo the
982  * optimal order established by xfs_zone_gc_query().
983  */
984 static bool
985 xfs_zone_gc_handle_work(
986 	struct xfs_zone_gc_data	*data)
987 {
988 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
989 	struct xfs_gc_bio	*chunk, *next;
990 	struct xfs_group	*reset_list;
991 	struct blk_plug		plug;
992 
993 	spin_lock(&zi->zi_reset_list_lock);
994 	reset_list = zi->zi_reset_list;
995 	zi->zi_reset_list = NULL;
996 	spin_unlock(&zi->zi_reset_list_lock);
997 
998 	if (!xfs_zone_gc_select_victim(data) ||
999 	    !xfs_zone_gc_space_available(data)) {
1000 		if (list_empty(&data->reading) &&
1001 		    list_empty(&data->writing) &&
1002 		    list_empty(&data->resetting) &&
1003 		    !reset_list)
1004 			return false;
1005 	}
1006 
1007 	__set_current_state(TASK_RUNNING);
1008 	try_to_freeze();
1009 
1010 	if (reset_list)
1011 		xfs_zone_gc_reset_zones(data, reset_list);
1012 
1013 	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1014 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1015 			break;
1016 		xfs_zone_gc_finish_reset(chunk);
1017 	}
1018 
1019 	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1020 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1021 			break;
1022 		xfs_zone_gc_finish_chunk(chunk);
1023 	}
1024 
1025 	blk_start_plug(&plug);
1026 	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1027 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1028 			break;
1029 		xfs_zone_gc_write_chunk(chunk);
1030 	}
1031 	blk_finish_plug(&plug);
1032 
1033 	blk_start_plug(&plug);
1034 	while (xfs_zone_gc_start_chunk(data))
1035 		;
1036 	blk_finish_plug(&plug);
1037 	return true;
1038 }
1039 
1040 /*
1041  * Note that the current GC algorithm would break reflinks and thus duplicate
1042  * data that was previously shared by multiple owners.  Because of that,
1043  * reflinks are currently not supported on zoned file systems: they cannot be
1044  * created, and file systems containing them cannot be mounted.
1045  */
1046 static int
1047 xfs_zoned_gcd(
1048 	void			*private)
1049 {
1050 	struct xfs_zone_gc_data	*data = private;
1051 	struct xfs_mount	*mp = data->mp;
1052 	struct xfs_zone_info	*zi = mp->m_zone_info;
1053 	unsigned int		nofs_flag;
1054 
1055 	nofs_flag = memalloc_nofs_save();
1056 	set_freezable();
1057 
1058 	for (;;) {
1059 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1060 		xfs_set_zonegc_running(mp);
1061 		if (xfs_zone_gc_handle_work(data))
1062 			continue;
1063 
1064 		if (list_empty(&data->reading) &&
1065 		    list_empty(&data->writing) &&
1066 		    list_empty(&data->resetting) &&
1067 		    !zi->zi_reset_list) {
1068 			xfs_clear_zonegc_running(mp);
1069 			xfs_zoned_resv_wake_all(mp);
1070 
1071 			if (kthread_should_stop()) {
1072 				__set_current_state(TASK_RUNNING);
1073 				break;
1074 			}
1075 
1076 			if (kthread_should_park()) {
1077 				__set_current_state(TASK_RUNNING);
1078 				kthread_parkme();
1079 				continue;
1080 			}
1081 		}
1082 
1083 		schedule();
1084 	}
1085 	xfs_clear_zonegc_running(mp);
1086 
1087 	if (data->iter.victim_rtg)
1088 		xfs_rtgroup_rele(data->iter.victim_rtg);
1089 
1090 	memalloc_nofs_restore(nofs_flag);
1091 	xfs_zone_gc_data_free(data);
1092 	return 0;
1093 }
1094 
1095 void
1096 xfs_zone_gc_start(
1097 	struct xfs_mount	*mp)
1098 {
1099 	if (xfs_has_zoned(mp))
1100 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1101 }
1102 
1103 void
1104 xfs_zone_gc_stop(
1105 	struct xfs_mount	*mp)
1106 {
1107 	if (xfs_has_zoned(mp))
1108 		kthread_park(mp->m_zone_info->zi_gc_thread);
1109 }
1110 
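/*
 * Set up zone GC at mount time: pick (or steal) the initial GC target zone,
 * allocate the per-mount GC state and create the GC thread in parked state.
 */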
1111 int
1112 xfs_zone_gc_mount(
1113 	struct xfs_mount	*mp)
1114 {
1115 	struct xfs_zone_info	*zi = mp->m_zone_info;
1116 	struct xfs_zone_gc_data	*data;
1117 	struct xfs_open_zone	*oz;
1118 	int			error;
1119 
1120 	/*
1121 	 * If there are no free zones available for GC, pick the open zone with
1122 	 * the least used space to GC into.  This should only happen after an
1123 	 * unclean shutdown near ENOSPC while GC was ongoing.
1124 	 *
1125 	 * We also need to do this for the first gc zone allocation if we
1126 	 * unmounted while at the open limit.
1127 	 */
1128 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1129 	    zi->zi_nr_open_zones == mp->m_max_open_zones)
1130 		oz = xfs_zone_gc_steal_open(zi);
1131 	else
1132 		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1133 	if (!oz) {
1134 		xfs_warn(mp, "unable to allocate a zone for gc");
1135 		error = -EIO;
1136 		goto out;
1137 	}
1138 
1139 	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1140 	zi->zi_open_gc_zone = oz;
1141 
1142 	data = xfs_zone_gc_data_alloc(mp);
1143 	if (!data) {
1144 		error = -ENOMEM;
1145 		goto out_put_gc_zone;
1146 	}
1147 
1148 	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1149 			"xfs-zone-gc/%s", mp->m_super->s_id);
1150 	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
1151 		xfs_warn(mp, "unable to create zone gc thread");
1152 		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
1153 		goto out_free_gc_data;
1154 	}
1155 
1156 	/* xfs_zone_gc_start will unpark for rw mounts */
1157 	kthread_park(mp->m_zone_info->zi_gc_thread);
1158 	return 0;
1159 
1160 out_free_gc_data:
1161 	kfree(data);
1162 out_put_gc_zone:
1163 	xfs_open_zone_put(zi->zi_open_gc_zone);
1164 out:
1165 	return error;
1166 }
1167 
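/*
 * Stop the GC thread at unmount time.  The thread frees the GC data on exit;
 * only the reference to the GC target zone needs to be dropped here.
 */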
1168 void
1169 xfs_zone_gc_unmount(
1170 	struct xfs_mount	*mp)
1171 {
1172 	struct xfs_zone_info	*zi = mp->m_zone_info;
1173 
1174 	kthread_stop(zi->zi_gc_thread);
1175 	if (zi->zi_open_gc_zone)
1176 		xfs_open_zone_put(zi->zi_open_gc_zone);
1177 }
1178