xref: /linux/fs/xfs/xfs_zone_gc.c (revision c148bc7535650fbfa95a1f571b9ffa2ab478ea33)
1080d01c4SChristoph Hellwig // SPDX-License-Identifier: GPL-2.0
2080d01c4SChristoph Hellwig /*
3080d01c4SChristoph Hellwig  * Copyright (c) 2023-2025 Christoph Hellwig.
4080d01c4SChristoph Hellwig  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5080d01c4SChristoph Hellwig  */
6080d01c4SChristoph Hellwig #include "xfs.h"
7080d01c4SChristoph Hellwig #include "xfs_shared.h"
8080d01c4SChristoph Hellwig #include "xfs_format.h"
9080d01c4SChristoph Hellwig #include "xfs_log_format.h"
10080d01c4SChristoph Hellwig #include "xfs_trans_resv.h"
11080d01c4SChristoph Hellwig #include "xfs_mount.h"
12080d01c4SChristoph Hellwig #include "xfs_inode.h"
13080d01c4SChristoph Hellwig #include "xfs_btree.h"
14080d01c4SChristoph Hellwig #include "xfs_trans.h"
15080d01c4SChristoph Hellwig #include "xfs_icache.h"
16080d01c4SChristoph Hellwig #include "xfs_rmap.h"
17080d01c4SChristoph Hellwig #include "xfs_rtbitmap.h"
18080d01c4SChristoph Hellwig #include "xfs_rtrmap_btree.h"
19080d01c4SChristoph Hellwig #include "xfs_zone_alloc.h"
20080d01c4SChristoph Hellwig #include "xfs_zone_priv.h"
21080d01c4SChristoph Hellwig #include "xfs_zones.h"
22080d01c4SChristoph Hellwig #include "xfs_trace.h"
23080d01c4SChristoph Hellwig 
24080d01c4SChristoph Hellwig /*
25080d01c4SChristoph Hellwig  * Implement Garbage Collection (GC) of partially used zones.
26080d01c4SChristoph Hellwig  *
27080d01c4SChristoph Hellwig  * To support the purely sequential writes in each zone, zoned XFS needs to be
28080d01c4SChristoph Hellwig  * able to move data remaining in a zone out of it to reset the zone to prepare
29080d01c4SChristoph Hellwig  * for writing to it again.
30080d01c4SChristoph Hellwig  *
31080d01c4SChristoph Hellwig  * This is done by the GC thread implemented in this file.  To support that, a
32080d01c4SChristoph Hellwig  * number of zones (XFS_GC_ZONES) is reserved from the user-visible capacity to
33080d01c4SChristoph Hellwig  * write the garbage collected data into.
34080d01c4SChristoph Hellwig  *
35080d01c4SChristoph Hellwig  * Whenever the available space is below the chosen threshold, the GC thread
36080d01c4SChristoph Hellwig  * looks for potential non-empty but not fully used zones that are worth
37080d01c4SChristoph Hellwig  * reclaiming.  Once found, the rmap for the victim zone is queried, and after
38080d01c4SChristoph Hellwig  * a bit of sorting to reduce fragmentation, the still live extents are read
39080d01c4SChristoph Hellwig  * into memory and written to the GC target zone, and the bmap btree of the
40080d01c4SChristoph Hellwig  * files is updated to point to the new location.  To avoid taking the IOLOCK
41080d01c4SChristoph Hellwig  * and MMAPLOCK for the entire GC process and thus affecting the latency of
42080d01c4SChristoph Hellwig  * user reads and writes to the files, the GC writes are speculative and the
43080d01c4SChristoph Hellwig  * I/O completion checks that no other writes happened for the affected regions
44080d01c4SChristoph Hellwig  * before remapping.
45080d01c4SChristoph Hellwig  *
46080d01c4SChristoph Hellwig  * Once a zone does not contain any valid data, be that through GC or user
47080d01c4SChristoph Hellwig  * block removal, it is queued for a zone reset.  The reset operation
48080d01c4SChristoph Hellwig  * carefully ensures that the RT device cache is flushed and all transactions
49080d01c4SChristoph Hellwig  * referencing the rmap have been committed to disk.
50080d01c4SChristoph Hellwig  */
51080d01c4SChristoph Hellwig 
52080d01c4SChristoph Hellwig /*
53080d01c4SChristoph Hellwig  * Size of each GC scratch pad.  This is also the upper bound for each
54080d01c4SChristoph Hellwig  * GC I/O, which helps to keep latency down.
55080d01c4SChristoph Hellwig  */
56080d01c4SChristoph Hellwig #define XFS_GC_CHUNK_SIZE	SZ_1M
57080d01c4SChristoph Hellwig 
58080d01c4SChristoph Hellwig /*
59080d01c4SChristoph Hellwig  * Scratchpad data to read GCed data into.
60080d01c4SChristoph Hellwig  *
61080d01c4SChristoph Hellwig  * The offset member tracks where the next allocation starts, and freed tracks
62080d01c4SChristoph Hellwig  * the amount of space that is not used anymore.
63080d01c4SChristoph Hellwig  */
64080d01c4SChristoph Hellwig #define XFS_ZONE_GC_NR_SCRATCH	2
65080d01c4SChristoph Hellwig struct xfs_zone_scratch {
66080d01c4SChristoph Hellwig 	struct folio			*folio;
67080d01c4SChristoph Hellwig 	unsigned int			offset;
68080d01c4SChristoph Hellwig 	unsigned int			freed;
69080d01c4SChristoph Hellwig };
70080d01c4SChristoph Hellwig 
71080d01c4SChristoph Hellwig /*
72080d01c4SChristoph Hellwig  * Chunk that is read and written for each GC operation.
73080d01c4SChristoph Hellwig  *
74080d01c4SChristoph Hellwig  * Note that for writes to actual zoned devices, the chunk can be split when
75080d01c4SChristoph Hellwig  * reaching the hardware limit.
76080d01c4SChristoph Hellwig  */
77080d01c4SChristoph Hellwig struct xfs_gc_bio {
78080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data		*data;
79080d01c4SChristoph Hellwig 
80080d01c4SChristoph Hellwig 	/*
81080d01c4SChristoph Hellwig 	 * Entry into the reading/writing/resetting list.  Only accessed from
82080d01c4SChristoph Hellwig 	 * the GC thread, so no locking needed.
83080d01c4SChristoph Hellwig 	 */
84080d01c4SChristoph Hellwig 	struct list_head		entry;
85080d01c4SChristoph Hellwig 
86080d01c4SChristoph Hellwig 	/*
87080d01c4SChristoph Hellwig 	 * State of this gc_bio.  Done means the current I/O completed.
88080d01c4SChristoph Hellwig 	 * Set from the bio end I/O handler, read from the GC thread.
89080d01c4SChristoph Hellwig 	 */
90080d01c4SChristoph Hellwig 	enum {
91080d01c4SChristoph Hellwig 		XFS_GC_BIO_NEW,
92080d01c4SChristoph Hellwig 		XFS_GC_BIO_DONE,
93080d01c4SChristoph Hellwig 	} state;
94080d01c4SChristoph Hellwig 
95080d01c4SChristoph Hellwig 	/*
96080d01c4SChristoph Hellwig 	 * Pointer to the inode and byte range in the inode that this
97080d01c4SChristoph Hellwig 	 * GC chunk is operating on.
98080d01c4SChristoph Hellwig 	 */
99080d01c4SChristoph Hellwig 	struct xfs_inode		*ip;
100080d01c4SChristoph Hellwig 	loff_t				offset;
101080d01c4SChristoph Hellwig 	unsigned int			len;
102080d01c4SChristoph Hellwig 
103080d01c4SChristoph Hellwig 	/*
104080d01c4SChristoph Hellwig 	 * Existing startblock (in the zone to be freed) and newly assigned
105080d01c4SChristoph Hellwig 	 * daddr in the zone GCed into.
106080d01c4SChristoph Hellwig 	 */
107080d01c4SChristoph Hellwig 	xfs_fsblock_t			old_startblock;
108080d01c4SChristoph Hellwig 	xfs_daddr_t			new_daddr;
109080d01c4SChristoph Hellwig 	struct xfs_zone_scratch		*scratch;
110080d01c4SChristoph Hellwig 
111080d01c4SChristoph Hellwig 	/* Are we writing to a sequential write required zone? */
112080d01c4SChristoph Hellwig 	bool				is_seq;
113080d01c4SChristoph Hellwig 
114080d01c4SChristoph Hellwig 	/* Open Zone being written to */
115080d01c4SChristoph Hellwig 	struct xfs_open_zone		*oz;
116080d01c4SChristoph Hellwig 
117080d01c4SChristoph Hellwig 	/* Bio used for reads and writes, including the bvec used by it */
118080d01c4SChristoph Hellwig 	struct bio_vec			bv;
119080d01c4SChristoph Hellwig 	struct bio			bio;	/* must be last */
120080d01c4SChristoph Hellwig };
121080d01c4SChristoph Hellwig 
122080d01c4SChristoph Hellwig #define XFS_ZONE_GC_RECS		1024
123080d01c4SChristoph Hellwig 
124080d01c4SChristoph Hellwig /* iterator, needs to be reinitialized for each victim zone */
125080d01c4SChristoph Hellwig struct xfs_zone_gc_iter {
126080d01c4SChristoph Hellwig 	struct xfs_rtgroup		*victim_rtg;
127080d01c4SChristoph Hellwig 	unsigned int			rec_count;
128080d01c4SChristoph Hellwig 	unsigned int			rec_idx;
129080d01c4SChristoph Hellwig 	xfs_agblock_t			next_startblock;
130080d01c4SChristoph Hellwig 	struct xfs_rmap_irec		*recs;
131080d01c4SChristoph Hellwig };
132080d01c4SChristoph Hellwig 
133080d01c4SChristoph Hellwig /*
134080d01c4SChristoph Hellwig  * Per-mount GC state.
135080d01c4SChristoph Hellwig  */
136080d01c4SChristoph Hellwig struct xfs_zone_gc_data {
137080d01c4SChristoph Hellwig 	struct xfs_mount		*mp;
138080d01c4SChristoph Hellwig 
139080d01c4SChristoph Hellwig 	/* bioset used to allocate the gc_bios */
140080d01c4SChristoph Hellwig 	struct bio_set			bio_set;
141080d01c4SChristoph Hellwig 
142080d01c4SChristoph Hellwig 	/*
143080d01c4SChristoph Hellwig 	 * Scratchpads used, and index indicating which one is currently used.
144080d01c4SChristoph Hellwig 	 */
145080d01c4SChristoph Hellwig 	struct xfs_zone_scratch		scratch[XFS_ZONE_GC_NR_SCRATCH];
146080d01c4SChristoph Hellwig 	unsigned int			scratch_idx;
147080d01c4SChristoph Hellwig 
148080d01c4SChristoph Hellwig 	/*
149080d01c4SChristoph Hellwig 	 * Lists of bios currently being read, written and reset.
150080d01c4SChristoph Hellwig 	 * These lists are only accessed by the GC thread itself, and must only
151080d01c4SChristoph Hellwig 	 * be processed in order.
152080d01c4SChristoph Hellwig 	 */
153080d01c4SChristoph Hellwig 	struct list_head		reading;
154080d01c4SChristoph Hellwig 	struct list_head		writing;
155080d01c4SChristoph Hellwig 	struct list_head		resetting;
156080d01c4SChristoph Hellwig 
157080d01c4SChristoph Hellwig 	/*
158080d01c4SChristoph Hellwig 	 * Iterator for the victim zone.
159080d01c4SChristoph Hellwig 	 */
160080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter		iter;
161080d01c4SChristoph Hellwig };
162080d01c4SChristoph Hellwig 
163080d01c4SChristoph Hellwig /*
164080d01c4SChristoph Hellwig  * We aim to keep enough zones free in stock to fully use the open zone limit
165080d01c4SChristoph Hellwig  * for data placement purposes.
166080d01c4SChristoph Hellwig  */
167080d01c4SChristoph Hellwig bool
168080d01c4SChristoph Hellwig xfs_zoned_need_gc(
169080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
170080d01c4SChristoph Hellwig {
171080d01c4SChristoph Hellwig 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
172080d01c4SChristoph Hellwig 		return false;
173080d01c4SChristoph Hellwig 	if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
174080d01c4SChristoph Hellwig 	    mp->m_groups[XG_TYPE_RTG].blocks *
175080d01c4SChristoph Hellwig 	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
176080d01c4SChristoph Hellwig 		return true;
177080d01c4SChristoph Hellwig 	return false;
178080d01c4SChristoph Hellwig }
179080d01c4SChristoph Hellwig 
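/*
 * Allocate the per-mount GC state, including the rmap record lookahead
 * array, the bioset used for the GC bios and the scratch folios that
 * garbage collected data is staged in.
 */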
180080d01c4SChristoph Hellwig static struct xfs_zone_gc_data *
181080d01c4SChristoph Hellwig xfs_zone_gc_data_alloc(
182080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
183080d01c4SChristoph Hellwig {
184080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data;
185080d01c4SChristoph Hellwig 	int			i;
186080d01c4SChristoph Hellwig 
187080d01c4SChristoph Hellwig 	data = kzalloc(sizeof(*data), GFP_KERNEL);
188080d01c4SChristoph Hellwig 	if (!data)
189080d01c4SChristoph Hellwig 		return NULL;
190080d01c4SChristoph Hellwig 	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
191080d01c4SChristoph Hellwig 			GFP_KERNEL);
192080d01c4SChristoph Hellwig 	if (!data->iter.recs)
193080d01c4SChristoph Hellwig 		goto out_free_data;
194080d01c4SChristoph Hellwig 
195080d01c4SChristoph Hellwig 	/*
196080d01c4SChristoph Hellwig 	 * We actually only need a single bio_vec.  It would be nice to have
197080d01c4SChristoph Hellwig 	 * a flag that only allocates the inline bvecs and not the separate
198080d01c4SChristoph Hellwig 	 * bvec pool.
199080d01c4SChristoph Hellwig 	 */
200080d01c4SChristoph Hellwig 	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
201080d01c4SChristoph Hellwig 			BIOSET_NEED_BVECS))
202080d01c4SChristoph Hellwig 		goto out_free_recs;
203080d01c4SChristoph Hellwig 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
204080d01c4SChristoph Hellwig 		data->scratch[i].folio =
205080d01c4SChristoph Hellwig 			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
206080d01c4SChristoph Hellwig 		if (!data->scratch[i].folio)
207080d01c4SChristoph Hellwig 			goto out_free_scratch;
208080d01c4SChristoph Hellwig 	}
209080d01c4SChristoph Hellwig 	INIT_LIST_HEAD(&data->reading);
210080d01c4SChristoph Hellwig 	INIT_LIST_HEAD(&data->writing);
211080d01c4SChristoph Hellwig 	INIT_LIST_HEAD(&data->resetting);
212080d01c4SChristoph Hellwig 	data->mp = mp;
213080d01c4SChristoph Hellwig 	return data;
214080d01c4SChristoph Hellwig 
215080d01c4SChristoph Hellwig out_free_scratch:
216080d01c4SChristoph Hellwig 	while (--i >= 0)
217080d01c4SChristoph Hellwig 		folio_put(data->scratch[i].folio);
218080d01c4SChristoph Hellwig 	bioset_exit(&data->bio_set);
219080d01c4SChristoph Hellwig out_free_recs:
220080d01c4SChristoph Hellwig 	kfree(data->iter.recs);
221080d01c4SChristoph Hellwig out_free_data:
222080d01c4SChristoph Hellwig 	kfree(data);
223080d01c4SChristoph Hellwig 	return NULL;
224080d01c4SChristoph Hellwig }
225080d01c4SChristoph Hellwig 
226080d01c4SChristoph Hellwig static void
227080d01c4SChristoph Hellwig xfs_zone_gc_data_free(
228080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
229080d01c4SChristoph Hellwig {
230080d01c4SChristoph Hellwig 	int			i;
231080d01c4SChristoph Hellwig 
232080d01c4SChristoph Hellwig 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
233080d01c4SChristoph Hellwig 		folio_put(data->scratch[i].folio);
234080d01c4SChristoph Hellwig 	bioset_exit(&data->bio_set);
235080d01c4SChristoph Hellwig 	kfree(data->iter.recs);
236080d01c4SChristoph Hellwig 	kfree(data);
237080d01c4SChristoph Hellwig }
238080d01c4SChristoph Hellwig 
239080d01c4SChristoph Hellwig static void
240080d01c4SChristoph Hellwig xfs_zone_gc_iter_init(
241080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter,
242080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*victim_rtg)
243080d01c4SChristoph Hellwig 
244080d01c4SChristoph Hellwig {
245080d01c4SChristoph Hellwig 	iter->next_startblock = 0;
246080d01c4SChristoph Hellwig 	iter->rec_count = 0;
247080d01c4SChristoph Hellwig 	iter->rec_idx = 0;
248080d01c4SChristoph Hellwig 	iter->victim_rtg = victim_rtg;
249080d01c4SChristoph Hellwig }
250080d01c4SChristoph Hellwig 
251080d01c4SChristoph Hellwig /*
252080d01c4SChristoph Hellwig  * Query the rmap of the victim zone to gather the records to evacuate.
253080d01c4SChristoph Hellwig  */
254080d01c4SChristoph Hellwig static int
255080d01c4SChristoph Hellwig xfs_zone_gc_query_cb(
256080d01c4SChristoph Hellwig 	struct xfs_btree_cur	*cur,
257080d01c4SChristoph Hellwig 	const struct xfs_rmap_irec *irec,
258080d01c4SChristoph Hellwig 	void			*private)
259080d01c4SChristoph Hellwig {
260080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter = private;
261080d01c4SChristoph Hellwig 
262080d01c4SChristoph Hellwig 	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
263080d01c4SChristoph Hellwig 	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
264080d01c4SChristoph Hellwig 	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
265080d01c4SChristoph Hellwig 
266080d01c4SChristoph Hellwig 	iter->recs[iter->rec_count] = *irec;
267080d01c4SChristoph Hellwig 	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
268080d01c4SChristoph Hellwig 		iter->next_startblock =
269080d01c4SChristoph Hellwig 			irec->rm_startblock + irec->rm_blockcount;
270080d01c4SChristoph Hellwig 		return 1;
271080d01c4SChristoph Hellwig 	}
272080d01c4SChristoph Hellwig 	return 0;
273080d01c4SChristoph Hellwig }
274080d01c4SChristoph Hellwig 
275080d01c4SChristoph Hellwig #define cmp_int(l, r)		((l > r) - (l < r))
276080d01c4SChristoph Hellwig 
277080d01c4SChristoph Hellwig static int
278080d01c4SChristoph Hellwig xfs_zone_gc_rmap_rec_cmp(
279080d01c4SChristoph Hellwig 	const void			*a,
280080d01c4SChristoph Hellwig 	const void			*b)
281080d01c4SChristoph Hellwig {
282080d01c4SChristoph Hellwig 	const struct xfs_rmap_irec	*reca = a;
283080d01c4SChristoph Hellwig 	const struct xfs_rmap_irec	*recb = b;
284080d01c4SChristoph Hellwig 	int				diff;
285080d01c4SChristoph Hellwig 
286080d01c4SChristoph Hellwig 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
287080d01c4SChristoph Hellwig 	if (diff)
288080d01c4SChristoph Hellwig 		return diff;
289080d01c4SChristoph Hellwig 	return cmp_int(reca->rm_offset, recb->rm_offset);
290080d01c4SChristoph Hellwig }
291080d01c4SChristoph Hellwig 
292080d01c4SChristoph Hellwig static int
293080d01c4SChristoph Hellwig xfs_zone_gc_query(
294080d01c4SChristoph Hellwig 	struct xfs_mount	*mp,
295080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter)
296080d01c4SChristoph Hellwig {
297080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg = iter->victim_rtg;
298080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	ri_low = { };
299080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	ri_high;
300080d01c4SChristoph Hellwig 	struct xfs_btree_cur	*cur;
301080d01c4SChristoph Hellwig 	struct xfs_trans	*tp;
302080d01c4SChristoph Hellwig 	int			error;
303080d01c4SChristoph Hellwig 
304080d01c4SChristoph Hellwig 	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
305080d01c4SChristoph Hellwig 	if (iter->next_startblock == rtg_blocks(rtg))
306080d01c4SChristoph Hellwig 		goto done;
307080d01c4SChristoph Hellwig 
308080d01c4SChristoph Hellwig 	ASSERT(iter->next_startblock < rtg_blocks(rtg));
309080d01c4SChristoph Hellwig 	ri_low.rm_startblock = iter->next_startblock;
310080d01c4SChristoph Hellwig 	memset(&ri_high, 0xFF, sizeof(ri_high));
311080d01c4SChristoph Hellwig 
312080d01c4SChristoph Hellwig 	iter->rec_idx = 0;
313080d01c4SChristoph Hellwig 	iter->rec_count = 0;
314080d01c4SChristoph Hellwig 
315080d01c4SChristoph Hellwig 	error = xfs_trans_alloc_empty(mp, &tp);
316080d01c4SChristoph Hellwig 	if (error)
317080d01c4SChristoph Hellwig 		return error;
318080d01c4SChristoph Hellwig 
319080d01c4SChristoph Hellwig 	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
320080d01c4SChristoph Hellwig 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
321080d01c4SChristoph Hellwig 	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
322080d01c4SChristoph Hellwig 			xfs_zone_gc_query_cb, iter);
323080d01c4SChristoph Hellwig 	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
324080d01c4SChristoph Hellwig 	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
325080d01c4SChristoph Hellwig 	xfs_trans_cancel(tp);
326080d01c4SChristoph Hellwig 
327080d01c4SChristoph Hellwig 	if (error < 0)
328080d01c4SChristoph Hellwig 		return error;
329080d01c4SChristoph Hellwig 
330080d01c4SChristoph Hellwig 	/*
331080d01c4SChristoph Hellwig 	 * Sort the rmap records by inode number and increasing offset to
332080d01c4SChristoph Hellwig 	 * defragment the mappings.
333080d01c4SChristoph Hellwig 	 *
334080d01c4SChristoph Hellwig 	 * This could be further enhanced by an even bigger look-ahead window,
335080d01c4SChristoph Hellwig 	 * but that's better left until we can better detect changes to the
336080d01c4SChristoph Hellwig 	 * inode mappings, to avoid GCing data that is already dead.
337080d01c4SChristoph Hellwig 	 */
338080d01c4SChristoph Hellwig 	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
339080d01c4SChristoph Hellwig 			xfs_zone_gc_rmap_rec_cmp, NULL);
340080d01c4SChristoph Hellwig 
341080d01c4SChristoph Hellwig 	if (error == 0) {
342080d01c4SChristoph Hellwig 		/*
343080d01c4SChristoph Hellwig 		 * We finished iterating through the zone.
344080d01c4SChristoph Hellwig 		 */
345080d01c4SChristoph Hellwig 		iter->next_startblock = rtg_blocks(rtg);
346080d01c4SChristoph Hellwig 		if (iter->rec_count == 0)
347080d01c4SChristoph Hellwig 			goto done;
348080d01c4SChristoph Hellwig 	}
349080d01c4SChristoph Hellwig 
350080d01c4SChristoph Hellwig 	return 0;
351080d01c4SChristoph Hellwig done:
352080d01c4SChristoph Hellwig 	xfs_rtgroup_rele(iter->victim_rtg);
353080d01c4SChristoph Hellwig 	iter->victim_rtg = NULL;
354080d01c4SChristoph Hellwig 	return 0;
355080d01c4SChristoph Hellwig }
356080d01c4SChristoph Hellwig 
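/*
 * Return the next rmap record to garbage collect and grab a reference to
 * the inode owning it.  Records owned by inodes that have been deleted or
 * that are not regular realtime files are skipped; hard errors shut the
 * file system down.
 */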
357080d01c4SChristoph Hellwig static bool
358080d01c4SChristoph Hellwig xfs_zone_gc_iter_next(
359080d01c4SChristoph Hellwig 	struct xfs_mount	*mp,
360080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter,
361080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	*chunk_rec,
362080d01c4SChristoph Hellwig 	struct xfs_inode	**ipp)
363080d01c4SChristoph Hellwig {
364080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	*irec;
365080d01c4SChristoph Hellwig 	int			error;
366080d01c4SChristoph Hellwig 
367080d01c4SChristoph Hellwig 	if (!iter->victim_rtg)
368080d01c4SChristoph Hellwig 		return false;
369080d01c4SChristoph Hellwig 
370080d01c4SChristoph Hellwig retry:
371080d01c4SChristoph Hellwig 	if (iter->rec_idx == iter->rec_count) {
372080d01c4SChristoph Hellwig 		error = xfs_zone_gc_query(mp, iter);
373080d01c4SChristoph Hellwig 		if (error)
374080d01c4SChristoph Hellwig 			goto fail;
375080d01c4SChristoph Hellwig 		if (!iter->victim_rtg)
376080d01c4SChristoph Hellwig 			return false;
377080d01c4SChristoph Hellwig 	}
378080d01c4SChristoph Hellwig 
379080d01c4SChristoph Hellwig 	irec = &iter->recs[iter->rec_idx];
380080d01c4SChristoph Hellwig 	error = xfs_iget(mp, NULL, irec->rm_owner,
381080d01c4SChristoph Hellwig 			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
382080d01c4SChristoph Hellwig 	if (error) {
383080d01c4SChristoph Hellwig 		/*
384080d01c4SChristoph Hellwig 		 * If the inode was already deleted, skip over it.
385080d01c4SChristoph Hellwig 		 */
386080d01c4SChristoph Hellwig 		if (error == -ENOENT) {
387080d01c4SChristoph Hellwig 			iter->rec_idx++;
388080d01c4SChristoph Hellwig 			goto retry;
389080d01c4SChristoph Hellwig 		}
390080d01c4SChristoph Hellwig 		goto fail;
391080d01c4SChristoph Hellwig 	}
392080d01c4SChristoph Hellwig 
393080d01c4SChristoph Hellwig 	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
394080d01c4SChristoph Hellwig 		iter->rec_idx++;
395080d01c4SChristoph Hellwig 		xfs_irele(*ipp);
396080d01c4SChristoph Hellwig 		goto retry;
397080d01c4SChristoph Hellwig 	}
398080d01c4SChristoph Hellwig 
399080d01c4SChristoph Hellwig 	*chunk_rec = *irec;
400080d01c4SChristoph Hellwig 	return true;
401080d01c4SChristoph Hellwig 
402080d01c4SChristoph Hellwig fail:
403080d01c4SChristoph Hellwig 	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
404080d01c4SChristoph Hellwig 	return false;
405080d01c4SChristoph Hellwig }
406080d01c4SChristoph Hellwig 
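/*
 * Trim the current rmap record by the number of blocks just processed and
 * move on to the next record once it has been fully consumed.
 */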
407080d01c4SChristoph Hellwig static void
408080d01c4SChristoph Hellwig xfs_zone_gc_iter_advance(
409080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter,
410080d01c4SChristoph Hellwig 	xfs_extlen_t		count_fsb)
411080d01c4SChristoph Hellwig {
412080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
413080d01c4SChristoph Hellwig 
414080d01c4SChristoph Hellwig 	irec->rm_offset += count_fsb;
415080d01c4SChristoph Hellwig 	irec->rm_startblock += count_fsb;
416080d01c4SChristoph Hellwig 	irec->rm_blockcount -= count_fsb;
417080d01c4SChristoph Hellwig 	if (!irec->rm_blockcount)
418080d01c4SChristoph Hellwig 		iter->rec_idx++;
419080d01c4SChristoph Hellwig }
420080d01c4SChristoph Hellwig 
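/*
 * Pick the zone with the fewest used blocks from the given used bucket as
 * the GC victim.
 */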
421080d01c4SChristoph Hellwig static struct xfs_rtgroup *
422080d01c4SChristoph Hellwig xfs_zone_gc_pick_victim_from(
423080d01c4SChristoph Hellwig 	struct xfs_mount	*mp,
424080d01c4SChristoph Hellwig 	uint32_t		bucket)
425080d01c4SChristoph Hellwig {
426080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
427080d01c4SChristoph Hellwig 	uint32_t		victim_used = U32_MAX;
428080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*victim_rtg = NULL;
429080d01c4SChristoph Hellwig 	uint32_t		bit;
430080d01c4SChristoph Hellwig 
431080d01c4SChristoph Hellwig 	if (!zi->zi_used_bucket_entries[bucket])
432080d01c4SChristoph Hellwig 		return NULL;
433080d01c4SChristoph Hellwig 
434080d01c4SChristoph Hellwig 	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
435080d01c4SChristoph Hellwig 			mp->m_sb.sb_rgcount) {
436080d01c4SChristoph Hellwig 		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
437080d01c4SChristoph Hellwig 
438080d01c4SChristoph Hellwig 		if (!rtg)
439080d01c4SChristoph Hellwig 			continue;
440080d01c4SChristoph Hellwig 
441080d01c4SChristoph Hellwig 		/* skip zones that are just waiting for a reset */
442080d01c4SChristoph Hellwig 		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
443080d01c4SChristoph Hellwig 		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
444080d01c4SChristoph Hellwig 			xfs_rtgroup_rele(rtg);
445080d01c4SChristoph Hellwig 			continue;
446080d01c4SChristoph Hellwig 		}
447080d01c4SChristoph Hellwig 
448080d01c4SChristoph Hellwig 		if (victim_rtg)
449080d01c4SChristoph Hellwig 			xfs_rtgroup_rele(victim_rtg);
450080d01c4SChristoph Hellwig 		victim_rtg = rtg;
451080d01c4SChristoph Hellwig 		victim_used = rtg_rmap(rtg)->i_used_blocks;
452080d01c4SChristoph Hellwig 
453080d01c4SChristoph Hellwig 		/*
454080d01c4SChristoph Hellwig 		 * Any zone that is less than 1 percent used is fair game for
455080d01c4SChristoph Hellwig 		 * instant reclaim. All of these zones are in the last
456080d01c4SChristoph Hellwig 		 * bucket, so avoid the expensive division for the zones
457080d01c4SChristoph Hellwig 		 * in the other buckets.
458080d01c4SChristoph Hellwig 		 */
459080d01c4SChristoph Hellwig 		if (bucket == 0 &&
460080d01c4SChristoph Hellwig 		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
461080d01c4SChristoph Hellwig 			break;
462080d01c4SChristoph Hellwig 	}
463080d01c4SChristoph Hellwig 
464080d01c4SChristoph Hellwig 	return victim_rtg;
465080d01c4SChristoph Hellwig }
466080d01c4SChristoph Hellwig 
467080d01c4SChristoph Hellwig /*
468080d01c4SChristoph Hellwig  * Iterate through all zones marked as reclaimable and find a candidate to
469080d01c4SChristoph Hellwig  * reclaim.
470080d01c4SChristoph Hellwig  */
471080d01c4SChristoph Hellwig static bool
472080d01c4SChristoph Hellwig xfs_zone_gc_select_victim(
473080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
474080d01c4SChristoph Hellwig {
475080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter = &data->iter;
476080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
477080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
478080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*victim_rtg = NULL;
479080d01c4SChristoph Hellwig 	unsigned int		bucket;
480080d01c4SChristoph Hellwig 
481080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp))
482080d01c4SChristoph Hellwig 		return false;
483080d01c4SChristoph Hellwig 
484080d01c4SChristoph Hellwig 	if (iter->victim_rtg)
485080d01c4SChristoph Hellwig 		return true;
486080d01c4SChristoph Hellwig 
487080d01c4SChristoph Hellwig 	/*
488080d01c4SChristoph Hellwig 	 * Don't start new work if we are asked to stop or park.
489080d01c4SChristoph Hellwig 	 */
490080d01c4SChristoph Hellwig 	if (kthread_should_stop() || kthread_should_park())
491080d01c4SChristoph Hellwig 		return false;
492080d01c4SChristoph Hellwig 
493080d01c4SChristoph Hellwig 	if (!xfs_zoned_need_gc(mp))
494080d01c4SChristoph Hellwig 		return false;
495080d01c4SChristoph Hellwig 
496080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_used_buckets_lock);
497080d01c4SChristoph Hellwig 	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
498080d01c4SChristoph Hellwig 		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
499080d01c4SChristoph Hellwig 		if (victim_rtg)
500080d01c4SChristoph Hellwig 			break;
501080d01c4SChristoph Hellwig 	}
502080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_used_buckets_lock);
503080d01c4SChristoph Hellwig 
504080d01c4SChristoph Hellwig 	if (!victim_rtg)
505080d01c4SChristoph Hellwig 		return false;
506080d01c4SChristoph Hellwig 
507080d01c4SChristoph Hellwig 	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
508080d01c4SChristoph Hellwig 	xfs_zone_gc_iter_init(iter, victim_rtg);
509080d01c4SChristoph Hellwig 	return true;
510080d01c4SChristoph Hellwig }
511080d01c4SChristoph Hellwig 
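/*
 * Take the open zone with the lowest write pointer off the list of zones
 * available to regular writers and mark it as a GC zone.
 */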
512080d01c4SChristoph Hellwig static struct xfs_open_zone *
513080d01c4SChristoph Hellwig xfs_zone_gc_steal_open(
514080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi)
515080d01c4SChristoph Hellwig {
516080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz, *found = NULL;
517080d01c4SChristoph Hellwig 
518080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_open_zones_lock);
519080d01c4SChristoph Hellwig 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
520080d01c4SChristoph Hellwig 		if (!found ||
521080d01c4SChristoph Hellwig 		    oz->oz_write_pointer < found->oz_write_pointer)
522080d01c4SChristoph Hellwig 			found = oz;
523080d01c4SChristoph Hellwig 	}
524080d01c4SChristoph Hellwig 
525080d01c4SChristoph Hellwig 	if (found) {
526080d01c4SChristoph Hellwig 		found->oz_is_gc = true;
527080d01c4SChristoph Hellwig 		list_del_init(&found->oz_entry);
528080d01c4SChristoph Hellwig 		zi->zi_nr_open_zones--;
529080d01c4SChristoph Hellwig 	}
530080d01c4SChristoph Hellwig 
531080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_open_zones_lock);
532080d01c4SChristoph Hellwig 	return found;
533080d01c4SChristoph Hellwig }
534080d01c4SChristoph Hellwig 
535080d01c4SChristoph Hellwig static struct xfs_open_zone *
536080d01c4SChristoph Hellwig xfs_zone_gc_select_target(
537080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
538080d01c4SChristoph Hellwig {
539080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
540080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;
541080d01c4SChristoph Hellwig 
542080d01c4SChristoph Hellwig 	/*
543080d01c4SChristoph Hellwig 	 * We need to wait for pending writes to finish.
544080d01c4SChristoph Hellwig 	 */
545080d01c4SChristoph Hellwig 	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
546080d01c4SChristoph Hellwig 		return NULL;
547080d01c4SChristoph Hellwig 
548080d01c4SChristoph Hellwig 	ASSERT(zi->zi_nr_open_zones <=
549080d01c4SChristoph Hellwig 		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
550*64d03611SHans Holmberg 	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
551080d01c4SChristoph Hellwig 	if (oz)
552080d01c4SChristoph Hellwig 		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
553080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_open_zones_lock);
554080d01c4SChristoph Hellwig 	zi->zi_open_gc_zone = oz;
555080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_open_zones_lock);
556080d01c4SChristoph Hellwig 	return oz;
557080d01c4SChristoph Hellwig }
558080d01c4SChristoph Hellwig 
559080d01c4SChristoph Hellwig /*
560080d01c4SChristoph Hellwig  * Ensure we have a valid open zone to write the GC data to.
561080d01c4SChristoph Hellwig  *
562080d01c4SChristoph Hellwig  * If the current target zone has space keep writing to it, else first wait for
563080d01c4SChristoph Hellwig  * all pending writes and then pick a new one.
564080d01c4SChristoph Hellwig  */
565080d01c4SChristoph Hellwig static struct xfs_open_zone *
566080d01c4SChristoph Hellwig xfs_zone_gc_ensure_target(
567080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
568080d01c4SChristoph Hellwig {
569080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;
570080d01c4SChristoph Hellwig 
571080d01c4SChristoph Hellwig 	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
572080d01c4SChristoph Hellwig 		return xfs_zone_gc_select_target(mp);
573080d01c4SChristoph Hellwig 	return oz;
574080d01c4SChristoph Hellwig }
575080d01c4SChristoph Hellwig 
576080d01c4SChristoph Hellwig static unsigned int
577080d01c4SChristoph Hellwig xfs_zone_gc_scratch_available(
578080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
579080d01c4SChristoph Hellwig {
580080d01c4SChristoph Hellwig 	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
581080d01c4SChristoph Hellwig }
582080d01c4SChristoph Hellwig 
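/*
 * Check that the GC target zone still has unwritten blocks and that there
 * is scratch space left to stage data into.
 */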
583080d01c4SChristoph Hellwig static bool
584080d01c4SChristoph Hellwig xfs_zone_gc_space_available(
585080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
586080d01c4SChristoph Hellwig {
587080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
588080d01c4SChristoph Hellwig 
589080d01c4SChristoph Hellwig 	oz = xfs_zone_gc_ensure_target(data->mp);
590080d01c4SChristoph Hellwig 	if (!oz)
591080d01c4SChristoph Hellwig 		return false;
592080d01c4SChristoph Hellwig 	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
593080d01c4SChristoph Hellwig 		xfs_zone_gc_scratch_available(data);
594080d01c4SChristoph Hellwig }
595080d01c4SChristoph Hellwig 
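/*
 * Completion handler for all GC bios: mark the chunk as done and kick the
 * GC thread, which finishes processing from its main loop.
 */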
596080d01c4SChristoph Hellwig static void
597080d01c4SChristoph Hellwig xfs_zone_gc_end_io(
598080d01c4SChristoph Hellwig 	struct bio		*bio)
599080d01c4SChristoph Hellwig {
600080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk =
601080d01c4SChristoph Hellwig 		container_of(bio, struct xfs_gc_bio, bio);
602080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data = chunk->data;
603080d01c4SChristoph Hellwig 
604080d01c4SChristoph Hellwig 	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
605080d01c4SChristoph Hellwig 	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
606080d01c4SChristoph Hellwig }
607080d01c4SChristoph Hellwig 
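/*
 * Allocate space in the GC target zone for up to *count_fsb blocks of
 * garbage collected data.  The allocation is clamped to the available
 * scratch space, the remaining capacity of the target zone and the
 * reserved free space pools.  Returns the target open zone with an extra
 * reference held, or NULL if no space could be allocated.
 */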
608080d01c4SChristoph Hellwig static struct xfs_open_zone *
609080d01c4SChristoph Hellwig xfs_zone_gc_alloc_blocks(
610080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
611080d01c4SChristoph Hellwig 	xfs_extlen_t		*count_fsb,
612080d01c4SChristoph Hellwig 	xfs_daddr_t		*daddr,
613080d01c4SChristoph Hellwig 	bool			*is_seq)
614080d01c4SChristoph Hellwig {
615080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
616080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
617080d01c4SChristoph Hellwig 
618080d01c4SChristoph Hellwig 	oz = xfs_zone_gc_ensure_target(mp);
619080d01c4SChristoph Hellwig 	if (!oz)
620080d01c4SChristoph Hellwig 		return NULL;
621080d01c4SChristoph Hellwig 
622080d01c4SChristoph Hellwig 	*count_fsb = min(*count_fsb,
623080d01c4SChristoph Hellwig 		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
624080d01c4SChristoph Hellwig 
625080d01c4SChristoph Hellwig 	/*
626080d01c4SChristoph Hellwig 	 * Directly allocate GC blocks from the reserved pool.
627080d01c4SChristoph Hellwig 	 *
628080d01c4SChristoph Hellwig 	 * If we'd take them from the normal pool we could be stealing blocks
629080d01c4SChristoph Hellwig 	 * from a regular writer, which would then have to wait for GC and
630080d01c4SChristoph Hellwig 	 * deadlock.
631080d01c4SChristoph Hellwig 	 */
632080d01c4SChristoph Hellwig 	spin_lock(&mp->m_sb_lock);
633080d01c4SChristoph Hellwig 	*count_fsb = min(*count_fsb,
634080d01c4SChristoph Hellwig 			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
635080d01c4SChristoph Hellwig 	*count_fsb = min3(*count_fsb,
636080d01c4SChristoph Hellwig 			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
637080d01c4SChristoph Hellwig 			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
638080d01c4SChristoph Hellwig 	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
639080d01c4SChristoph Hellwig 	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
640080d01c4SChristoph Hellwig 	spin_unlock(&mp->m_sb_lock);
641080d01c4SChristoph Hellwig 
642080d01c4SChristoph Hellwig 	if (!*count_fsb)
643080d01c4SChristoph Hellwig 		return NULL;
644080d01c4SChristoph Hellwig 
645080d01c4SChristoph Hellwig 	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
646080d01c4SChristoph Hellwig 	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
647080d01c4SChristoph Hellwig 	if (!*is_seq)
648080d01c4SChristoph Hellwig 		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
649080d01c4SChristoph Hellwig 	oz->oz_write_pointer += *count_fsb;
650080d01c4SChristoph Hellwig 	atomic_inc(&oz->oz_ref);
651080d01c4SChristoph Hellwig 	return oz;
652080d01c4SChristoph Hellwig }
653080d01c4SChristoph Hellwig 
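/*
 * Kick off reading one chunk of live data from the victim zone into the
 * scratch folio.  The same bio is later reused for the write to the GC
 * target zone.
 */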
654080d01c4SChristoph Hellwig static bool
655080d01c4SChristoph Hellwig xfs_zone_gc_start_chunk(
656080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
657080d01c4SChristoph Hellwig {
658080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter = &data->iter;
659080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
660080d01c4SChristoph Hellwig 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
661080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
662080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	irec;
663080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk;
664080d01c4SChristoph Hellwig 	struct xfs_inode	*ip;
665080d01c4SChristoph Hellwig 	struct bio		*bio;
666080d01c4SChristoph Hellwig 	xfs_daddr_t		daddr;
667080d01c4SChristoph Hellwig 	bool			is_seq;
668080d01c4SChristoph Hellwig 
669080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp))
670080d01c4SChristoph Hellwig 		return false;
671080d01c4SChristoph Hellwig 
672080d01c4SChristoph Hellwig 	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
673080d01c4SChristoph Hellwig 		return false;
674080d01c4SChristoph Hellwig 	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
675080d01c4SChristoph Hellwig 			&is_seq);
676080d01c4SChristoph Hellwig 	if (!oz) {
677080d01c4SChristoph Hellwig 		xfs_irele(ip);
678080d01c4SChristoph Hellwig 		return false;
679080d01c4SChristoph Hellwig 	}
680080d01c4SChristoph Hellwig 
681080d01c4SChristoph Hellwig 	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
682080d01c4SChristoph Hellwig 
683080d01c4SChristoph Hellwig 	chunk = container_of(bio, struct xfs_gc_bio, bio);
684080d01c4SChristoph Hellwig 	chunk->ip = ip;
685080d01c4SChristoph Hellwig 	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
686080d01c4SChristoph Hellwig 	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
687080d01c4SChristoph Hellwig 	chunk->old_startblock =
688080d01c4SChristoph Hellwig 		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
689080d01c4SChristoph Hellwig 	chunk->new_daddr = daddr;
690080d01c4SChristoph Hellwig 	chunk->is_seq = is_seq;
691080d01c4SChristoph Hellwig 	chunk->scratch = &data->scratch[data->scratch_idx];
692080d01c4SChristoph Hellwig 	chunk->data = data;
693080d01c4SChristoph Hellwig 	chunk->oz = oz;
694080d01c4SChristoph Hellwig 
695080d01c4SChristoph Hellwig 	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
696080d01c4SChristoph Hellwig 	bio->bi_end_io = xfs_zone_gc_end_io;
697080d01c4SChristoph Hellwig 	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
698080d01c4SChristoph Hellwig 			chunk->scratch->offset);
699080d01c4SChristoph Hellwig 	chunk->scratch->offset += chunk->len;
700080d01c4SChristoph Hellwig 	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
701080d01c4SChristoph Hellwig 		data->scratch_idx =
702080d01c4SChristoph Hellwig 			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
703080d01c4SChristoph Hellwig 	}
704080d01c4SChristoph Hellwig 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
705080d01c4SChristoph Hellwig 	list_add_tail(&chunk->entry, &data->reading);
706080d01c4SChristoph Hellwig 	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
707080d01c4SChristoph Hellwig 
708080d01c4SChristoph Hellwig 	submit_bio(bio);
709080d01c4SChristoph Hellwig 	return true;
710080d01c4SChristoph Hellwig }
711080d01c4SChristoph Hellwig 
712080d01c4SChristoph Hellwig static void
713080d01c4SChristoph Hellwig xfs_zone_gc_free_chunk(
714080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
715080d01c4SChristoph Hellwig {
716080d01c4SChristoph Hellwig 	list_del(&chunk->entry);
717080d01c4SChristoph Hellwig 	xfs_open_zone_put(chunk->oz);
718080d01c4SChristoph Hellwig 	xfs_irele(chunk->ip);
719080d01c4SChristoph Hellwig 	bio_put(&chunk->bio);
720080d01c4SChristoph Hellwig }
721080d01c4SChristoph Hellwig 
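/*
 * Submit a GC write.  For sequential write required zones the write is
 * issued as a zone append, and the actual write location is picked up at
 * I/O completion time.
 */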
722080d01c4SChristoph Hellwig static void
723080d01c4SChristoph Hellwig xfs_zone_gc_submit_write(
724080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
725080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
726080d01c4SChristoph Hellwig {
727080d01c4SChristoph Hellwig 	if (chunk->is_seq) {
728080d01c4SChristoph Hellwig 		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
729080d01c4SChristoph Hellwig 		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
730080d01c4SChristoph Hellwig 	}
731080d01c4SChristoph Hellwig 	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
732080d01c4SChristoph Hellwig 	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
733080d01c4SChristoph Hellwig 	submit_bio(&chunk->bio);
734080d01c4SChristoph Hellwig }
735080d01c4SChristoph Hellwig 
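/*
 * Split a GC write that exceeds the zone append limit.  The split chunk
 * inherits the state of the original and is queued right before it so
 * that completions are still processed in order.
 */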
736080d01c4SChristoph Hellwig static struct xfs_gc_bio *
737080d01c4SChristoph Hellwig xfs_zone_gc_split_write(
738080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
739080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
740080d01c4SChristoph Hellwig {
741080d01c4SChristoph Hellwig 	struct queue_limits	*lim =
742080d01c4SChristoph Hellwig 		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
743080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*split_chunk;
744080d01c4SChristoph Hellwig 	int			split_sectors;
745080d01c4SChristoph Hellwig 	unsigned int		split_len;
746080d01c4SChristoph Hellwig 	struct bio		*split;
747080d01c4SChristoph Hellwig 	unsigned int		nsegs;
748080d01c4SChristoph Hellwig 
749080d01c4SChristoph Hellwig 	if (!chunk->is_seq)
750080d01c4SChristoph Hellwig 		return NULL;
751080d01c4SChristoph Hellwig 
752080d01c4SChristoph Hellwig 	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
753080d01c4SChristoph Hellwig 			lim->max_zone_append_sectors << SECTOR_SHIFT);
754080d01c4SChristoph Hellwig 	if (!split_sectors)
755080d01c4SChristoph Hellwig 		return NULL;
756080d01c4SChristoph Hellwig 
757080d01c4SChristoph Hellwig 	/* ensure the split chunk is still block size aligned */
758080d01c4SChristoph Hellwig 	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
759080d01c4SChristoph Hellwig 			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
760080d01c4SChristoph Hellwig 	split_len = split_sectors << SECTOR_SHIFT;
761080d01c4SChristoph Hellwig 
762080d01c4SChristoph Hellwig 	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
763080d01c4SChristoph Hellwig 	split_chunk = container_of(split, struct xfs_gc_bio, bio);
764080d01c4SChristoph Hellwig 	split_chunk->data = data;
765080d01c4SChristoph Hellwig 	ihold(VFS_I(chunk->ip));
766080d01c4SChristoph Hellwig 	split_chunk->ip = chunk->ip;
767080d01c4SChristoph Hellwig 	split_chunk->is_seq = chunk->is_seq;
768080d01c4SChristoph Hellwig 	split_chunk->scratch = chunk->scratch;
769080d01c4SChristoph Hellwig 	split_chunk->offset = chunk->offset;
770080d01c4SChristoph Hellwig 	split_chunk->len = split_len;
771080d01c4SChristoph Hellwig 	split_chunk->old_startblock = chunk->old_startblock;
772080d01c4SChristoph Hellwig 	split_chunk->new_daddr = chunk->new_daddr;
773080d01c4SChristoph Hellwig 	split_chunk->oz = chunk->oz;
774080d01c4SChristoph Hellwig 	atomic_inc(&chunk->oz->oz_ref);
775080d01c4SChristoph Hellwig 
776080d01c4SChristoph Hellwig 	chunk->offset += split_len;
777080d01c4SChristoph Hellwig 	chunk->len -= split_len;
778080d01c4SChristoph Hellwig 	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
779080d01c4SChristoph Hellwig 
780080d01c4SChristoph Hellwig 	/* add right before the original chunk */
781080d01c4SChristoph Hellwig 	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
782080d01c4SChristoph Hellwig 	list_add_tail(&split_chunk->entry, &chunk->entry);
783080d01c4SChristoph Hellwig 	return split_chunk;
784080d01c4SChristoph Hellwig }
785080d01c4SChristoph Hellwig 
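/*
 * Turn a completed read chunk into a write to the GC target zone and
 * submit it, splitting it up as needed.
 */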
786080d01c4SChristoph Hellwig static void
787080d01c4SChristoph Hellwig xfs_zone_gc_write_chunk(
788080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
789080d01c4SChristoph Hellwig {
790080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data = chunk->data;
791080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = chunk->ip->i_mount;
792080d01c4SChristoph Hellwig 	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
793080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*split_chunk;
794080d01c4SChristoph Hellwig 
795080d01c4SChristoph Hellwig 	if (chunk->bio.bi_status)
796080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
797080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp)) {
798080d01c4SChristoph Hellwig 		xfs_zone_gc_free_chunk(chunk);
799080d01c4SChristoph Hellwig 		return;
800080d01c4SChristoph Hellwig 	}
801080d01c4SChristoph Hellwig 
802080d01c4SChristoph Hellwig 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
803080d01c4SChristoph Hellwig 	list_move_tail(&chunk->entry, &data->writing);
804080d01c4SChristoph Hellwig 
805080d01c4SChristoph Hellwig 	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
806080d01c4SChristoph Hellwig 	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
807080d01c4SChristoph Hellwig 			folio_offset);
808080d01c4SChristoph Hellwig 
809080d01c4SChristoph Hellwig 	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
810080d01c4SChristoph Hellwig 		xfs_zone_gc_submit_write(data, split_chunk);
811080d01c4SChristoph Hellwig 	xfs_zone_gc_submit_write(data, chunk);
812080d01c4SChristoph Hellwig }
813080d01c4SChristoph Hellwig 
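/*
 * Finish a GC write: return the scratch space and remap the file extent to
 * the new location, unless another write raced with GC in the meantime.
 */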
814080d01c4SChristoph Hellwig static void
815080d01c4SChristoph Hellwig xfs_zone_gc_finish_chunk(
816080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
817080d01c4SChristoph Hellwig {
818080d01c4SChristoph Hellwig 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
819080d01c4SChristoph Hellwig 	struct xfs_inode	*ip = chunk->ip;
820080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = ip->i_mount;
821080d01c4SChristoph Hellwig 	int			error;
822080d01c4SChristoph Hellwig 
823080d01c4SChristoph Hellwig 	if (chunk->bio.bi_status)
824080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
825080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp)) {
826080d01c4SChristoph Hellwig 		xfs_zone_gc_free_chunk(chunk);
827080d01c4SChristoph Hellwig 		return;
828080d01c4SChristoph Hellwig 	}
829080d01c4SChristoph Hellwig 
830080d01c4SChristoph Hellwig 	chunk->scratch->freed += chunk->len;
831080d01c4SChristoph Hellwig 	if (chunk->scratch->freed == chunk->scratch->offset) {
832080d01c4SChristoph Hellwig 		chunk->scratch->offset = 0;
833080d01c4SChristoph Hellwig 		chunk->scratch->freed = 0;
834080d01c4SChristoph Hellwig 	}
835080d01c4SChristoph Hellwig 
836080d01c4SChristoph Hellwig 	/*
837080d01c4SChristoph Hellwig 	 * Cycle through the iolock and wait for direct I/O and layouts to
838080d01c4SChristoph Hellwig 	 * ensure no one is reading from the old mapping before it goes away.
839080d01c4SChristoph Hellwig 	 *
840080d01c4SChristoph Hellwig 	 * Note that xfs_zoned_end_io() below checks that no other writer raced
841080d01c4SChristoph Hellwig 	 * with us to update the mapping by checking that the old startblock
842080d01c4SChristoph Hellwig 	 * didn't change.
843080d01c4SChristoph Hellwig 	 */
844080d01c4SChristoph Hellwig 	xfs_ilock(ip, iolock);
845080d01c4SChristoph Hellwig 	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
846080d01c4SChristoph Hellwig 	if (!error)
847080d01c4SChristoph Hellwig 		inode_dio_wait(VFS_I(ip));
848080d01c4SChristoph Hellwig 	xfs_iunlock(ip, iolock);
849080d01c4SChristoph Hellwig 	if (error)
850080d01c4SChristoph Hellwig 		goto free;
851080d01c4SChristoph Hellwig 
852080d01c4SChristoph Hellwig 	if (chunk->is_seq)
853080d01c4SChristoph Hellwig 		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
854080d01c4SChristoph Hellwig 	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
855080d01c4SChristoph Hellwig 			chunk->new_daddr, chunk->oz, chunk->old_startblock);
856080d01c4SChristoph Hellwig free:
857080d01c4SChristoph Hellwig 	if (error)
858080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
859080d01c4SChristoph Hellwig 	xfs_zone_gc_free_chunk(chunk);
860080d01c4SChristoph Hellwig }
861080d01c4SChristoph Hellwig 
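/*
 * Finish a zone reset: mark the zone as free, account for the newly
 * available blocks and wake up any waiters.
 */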
862080d01c4SChristoph Hellwig static void
863080d01c4SChristoph Hellwig xfs_zone_gc_finish_reset(
864080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
865080d01c4SChristoph Hellwig {
866080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
867080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = rtg_mount(rtg);
868080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
869080d01c4SChristoph Hellwig 
870080d01c4SChristoph Hellwig 	if (chunk->bio.bi_status) {
871080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
872080d01c4SChristoph Hellwig 		goto out;
873080d01c4SChristoph Hellwig 	}
874080d01c4SChristoph Hellwig 
875080d01c4SChristoph Hellwig 	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
876080d01c4SChristoph Hellwig 	atomic_inc(&zi->zi_nr_free_zones);
877080d01c4SChristoph Hellwig 
878080d01c4SChristoph Hellwig 	xfs_zoned_add_available(mp, rtg_blocks(rtg));
879080d01c4SChristoph Hellwig 
880080d01c4SChristoph Hellwig 	wake_up_all(&zi->zi_zone_wait);
881080d01c4SChristoph Hellwig out:
882080d01c4SChristoph Hellwig 	list_del(&chunk->entry);
883080d01c4SChristoph Hellwig 	bio_put(&chunk->bio);
884080d01c4SChristoph Hellwig }
885080d01c4SChristoph Hellwig 
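/*
 * Prepare a bio to reset a zone.  Conventional zones are discarded instead
 * of reset; if discard is not supported either, return false so that no
 * I/O is issued.
 */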
886080d01c4SChristoph Hellwig static bool
887080d01c4SChristoph Hellwig xfs_zone_gc_prepare_reset(
888080d01c4SChristoph Hellwig 	struct bio		*bio,
889080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg)
890080d01c4SChristoph Hellwig {
891080d01c4SChristoph Hellwig 	trace_xfs_zone_reset(rtg);
892080d01c4SChristoph Hellwig 
893080d01c4SChristoph Hellwig 	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
894080d01c4SChristoph Hellwig 	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
895080d01c4SChristoph Hellwig 	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
896080d01c4SChristoph Hellwig 		if (!bdev_max_discard_sectors(bio->bi_bdev))
897080d01c4SChristoph Hellwig 			return false;
898080d01c4SChristoph Hellwig 		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
899080d01c4SChristoph Hellwig 		bio->bi_iter.bi_size =
900080d01c4SChristoph Hellwig 			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
901080d01c4SChristoph Hellwig 	}
902080d01c4SChristoph Hellwig 
903080d01c4SChristoph Hellwig 	return true;
904080d01c4SChristoph Hellwig }
905080d01c4SChristoph Hellwig 
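/*
 * Synchronously reset (or discard) a single zone and wait for completion.
 */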
906080d01c4SChristoph Hellwig int
907080d01c4SChristoph Hellwig xfs_zone_gc_reset_sync(
908080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg)
909080d01c4SChristoph Hellwig {
910080d01c4SChristoph Hellwig 	int			error = 0;
911080d01c4SChristoph Hellwig 	struct bio		bio;
912080d01c4SChristoph Hellwig 
913080d01c4SChristoph Hellwig 	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
914080d01c4SChristoph Hellwig 			REQ_OP_ZONE_RESET);
915080d01c4SChristoph Hellwig 	if (xfs_zone_gc_prepare_reset(&bio, rtg))
916080d01c4SChristoph Hellwig 		error = submit_bio_wait(&bio);
917080d01c4SChristoph Hellwig 	bio_uninit(&bio);
918080d01c4SChristoph Hellwig 
919080d01c4SChristoph Hellwig 	return error;
920080d01c4SChristoph Hellwig }
921080d01c4SChristoph Hellwig 
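/*
 * Reset all zones on the passed-in list.  The RT device cache is flushed
 * and the rmap inodes are forced to the log first to make sure all
 * transactions referencing the rmap have been committed to disk before
 * the zones are reset.
 */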
922080d01c4SChristoph Hellwig static void
923080d01c4SChristoph Hellwig xfs_zone_gc_reset_zones(
924080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
925080d01c4SChristoph Hellwig 	struct xfs_group	*reset_list)
926080d01c4SChristoph Hellwig {
927080d01c4SChristoph Hellwig 	struct xfs_group	*next = reset_list;
928080d01c4SChristoph Hellwig 
929080d01c4SChristoph Hellwig 	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
930080d01c4SChristoph Hellwig 		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
931080d01c4SChristoph Hellwig 		return;
932080d01c4SChristoph Hellwig 	}
933080d01c4SChristoph Hellwig 
934080d01c4SChristoph Hellwig 	do {
935080d01c4SChristoph Hellwig 		struct xfs_rtgroup	*rtg = to_rtg(next);
936080d01c4SChristoph Hellwig 		struct xfs_gc_bio	*chunk;
937080d01c4SChristoph Hellwig 		struct bio		*bio;
938080d01c4SChristoph Hellwig 
939080d01c4SChristoph Hellwig 		xfs_log_force_inode(rtg_rmap(rtg));
940080d01c4SChristoph Hellwig 
941080d01c4SChristoph Hellwig 		next = rtg_group(rtg)->xg_next_reset;
942080d01c4SChristoph Hellwig 		rtg_group(rtg)->xg_next_reset = NULL;
943080d01c4SChristoph Hellwig 
944080d01c4SChristoph Hellwig 		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
945080d01c4SChristoph Hellwig 				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
946080d01c4SChristoph Hellwig 		bio->bi_private = rtg;
947080d01c4SChristoph Hellwig 		bio->bi_end_io = xfs_zone_gc_end_io;
948080d01c4SChristoph Hellwig 
949080d01c4SChristoph Hellwig 		chunk = container_of(bio, struct xfs_gc_bio, bio);
950080d01c4SChristoph Hellwig 		chunk->data = data;
951080d01c4SChristoph Hellwig 		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
952080d01c4SChristoph Hellwig 		list_add_tail(&chunk->entry, &data->resetting);
953080d01c4SChristoph Hellwig 
954080d01c4SChristoph Hellwig 		/*
955080d01c4SChristoph Hellwig 		 * Also use the bio to drive the state machine when neither
956080d01c4SChristoph Hellwig 		 * zone reset nor discard is supported to keep things simple.
957080d01c4SChristoph Hellwig 		 */
958080d01c4SChristoph Hellwig 		if (xfs_zone_gc_prepare_reset(bio, rtg))
959080d01c4SChristoph Hellwig 			submit_bio(bio);
960080d01c4SChristoph Hellwig 		else
961080d01c4SChristoph Hellwig 			bio_endio(bio);
962080d01c4SChristoph Hellwig 	} while (next);
963080d01c4SChristoph Hellwig }
964080d01c4SChristoph Hellwig 
965080d01c4SChristoph Hellwig /*
966080d01c4SChristoph Hellwig  * Handle the work to read and write data for GC and to reset the zones,
967080d01c4SChristoph Hellwig  * including handling all completions.
968080d01c4SChristoph Hellwig  *
969080d01c4SChristoph Hellwig  * Note that the order of the chunks is preserved so that we don't undo the
970080d01c4SChristoph Hellwig  * optimal order established by xfs_zone_gc_query().
971080d01c4SChristoph Hellwig  */
972080d01c4SChristoph Hellwig static bool
973080d01c4SChristoph Hellwig xfs_zone_gc_handle_work(
974080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
975080d01c4SChristoph Hellwig {
976080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
977080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk, *next;
978080d01c4SChristoph Hellwig 	struct xfs_group	*reset_list;
979080d01c4SChristoph Hellwig 	struct blk_plug		plug;
980080d01c4SChristoph Hellwig 
981080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_reset_list_lock);
982080d01c4SChristoph Hellwig 	reset_list = zi->zi_reset_list;
983080d01c4SChristoph Hellwig 	zi->zi_reset_list = NULL;
984080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_reset_list_lock);
985080d01c4SChristoph Hellwig 
986080d01c4SChristoph Hellwig 	if (!xfs_zone_gc_select_victim(data) ||
987080d01c4SChristoph Hellwig 	    !xfs_zone_gc_space_available(data)) {
988080d01c4SChristoph Hellwig 		if (list_empty(&data->reading) &&
989080d01c4SChristoph Hellwig 		    list_empty(&data->writing) &&
990080d01c4SChristoph Hellwig 		    list_empty(&data->resetting) &&
991080d01c4SChristoph Hellwig 		    !reset_list)
992080d01c4SChristoph Hellwig 			return false;
993080d01c4SChristoph Hellwig 	}
994080d01c4SChristoph Hellwig 
995080d01c4SChristoph Hellwig 	__set_current_state(TASK_RUNNING);
996080d01c4SChristoph Hellwig 	try_to_freeze();
997080d01c4SChristoph Hellwig 
998080d01c4SChristoph Hellwig 	if (reset_list)
999080d01c4SChristoph Hellwig 		xfs_zone_gc_reset_zones(data, reset_list);
1000080d01c4SChristoph Hellwig 
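	/*
	 * Process completions in submission order: finish resets and writes
	 * that are done, then issue the writes for completed reads under a
	 * plug to batch them.  Each list is walked up to the first chunk that
	 * is still in flight.
	 */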
1001080d01c4SChristoph Hellwig 	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1002080d01c4SChristoph Hellwig 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1003080d01c4SChristoph Hellwig 			break;
1004080d01c4SChristoph Hellwig 		xfs_zone_gc_finish_reset(chunk);
1005080d01c4SChristoph Hellwig 	}
1006080d01c4SChristoph Hellwig 
1007080d01c4SChristoph Hellwig 	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1008080d01c4SChristoph Hellwig 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1009080d01c4SChristoph Hellwig 			break;
1010080d01c4SChristoph Hellwig 		xfs_zone_gc_finish_chunk(chunk);
1011080d01c4SChristoph Hellwig 	}
1012080d01c4SChristoph Hellwig 
1013080d01c4SChristoph Hellwig 	blk_start_plug(&plug);
1014080d01c4SChristoph Hellwig 	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1015080d01c4SChristoph Hellwig 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1016080d01c4SChristoph Hellwig 			break;
1017080d01c4SChristoph Hellwig 		xfs_zone_gc_write_chunk(chunk);
1018080d01c4SChristoph Hellwig 	}
1019080d01c4SChristoph Hellwig 	blk_finish_plug(&plug);
1020080d01c4SChristoph Hellwig 
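	/*
	 * Start new GC chunks, which kicks off the reads of the remaining
	 * live data in the victim zone, again batched under a plug.
	 */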
1021080d01c4SChristoph Hellwig 	blk_start_plug(&plug);
1022080d01c4SChristoph Hellwig 	while (xfs_zone_gc_start_chunk(data))
1023080d01c4SChristoph Hellwig 		;
1024080d01c4SChristoph Hellwig 	blk_finish_plug(&plug);
1025080d01c4SChristoph Hellwig 	return true;
1026080d01c4SChristoph Hellwig }
1027080d01c4SChristoph Hellwig 
1028080d01c4SChristoph Hellwig /*
1029080d01c4SChristoph Hellwig  * Note that the current GC algorithm would break reflinks and thus duplicate
1030080d01c4SChristoph Hellwig  * data that was shared by multiple owners before.  Because of that, reflinks
1031080d01c4SChristoph Hellwig  * are currently not supported on zoned file systems: a file system with both
1032080d01c4SChristoph Hellwig  * features enabled can neither be created nor mounted.
1033080d01c4SChristoph Hellwig  */
1034080d01c4SChristoph Hellwig static int
1035080d01c4SChristoph Hellwig xfs_zoned_gcd(
1036080d01c4SChristoph Hellwig 	void			*private)
1037080d01c4SChristoph Hellwig {
1038080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data = private;
1039080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
1040080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
1041080d01c4SChristoph Hellwig 	unsigned int		nofs_flag;
1042080d01c4SChristoph Hellwig 
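	/*
	 * Run the entire GC thread with NOFS allocation semantics so that
	 * memory allocations done on its behalf cannot recurse back into the
	 * file system.
	 */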
1043080d01c4SChristoph Hellwig 	nofs_flag = memalloc_nofs_save();
1044080d01c4SChristoph Hellwig 	set_freezable();
1045080d01c4SChristoph Hellwig 
1046080d01c4SChristoph Hellwig 	for (;;) {
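		/*
		 * Set the task state before looking for work so that a wakeup
		 * racing with the checks below is not lost before schedule().
		 */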
1047080d01c4SChristoph Hellwig 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1048080d01c4SChristoph Hellwig 		xfs_set_zonegc_running(mp);
1049080d01c4SChristoph Hellwig 		if (xfs_zone_gc_handle_work(data))
1050080d01c4SChristoph Hellwig 			continue;
1051080d01c4SChristoph Hellwig 
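		/*
		 * No work left in flight: mark GC as idle, wake anybody
		 * waiting on space reservations, and honor a pending stop or
		 * park request before sleeping.
		 */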
1052080d01c4SChristoph Hellwig 		if (list_empty(&data->reading) &&
1053080d01c4SChristoph Hellwig 		    list_empty(&data->writing) &&
1054080d01c4SChristoph Hellwig 		    list_empty(&data->resetting) &&
1055080d01c4SChristoph Hellwig 		    !zi->zi_reset_list) {
1056080d01c4SChristoph Hellwig 			xfs_clear_zonegc_running(mp);
1057080d01c4SChristoph Hellwig 			xfs_zoned_resv_wake_all(mp);
1058080d01c4SChristoph Hellwig 
1059080d01c4SChristoph Hellwig 			if (kthread_should_stop()) {
1060080d01c4SChristoph Hellwig 				__set_current_state(TASK_RUNNING);
1061080d01c4SChristoph Hellwig 				break;
1062080d01c4SChristoph Hellwig 			}
1063080d01c4SChristoph Hellwig 
1064080d01c4SChristoph Hellwig 			if (kthread_should_park()) {
1065080d01c4SChristoph Hellwig 				__set_current_state(TASK_RUNNING);
1066080d01c4SChristoph Hellwig 				kthread_parkme();
1067080d01c4SChristoph Hellwig 				continue;
1068080d01c4SChristoph Hellwig 			}
1069080d01c4SChristoph Hellwig 		}
1070080d01c4SChristoph Hellwig 
1071080d01c4SChristoph Hellwig 		schedule();
1072080d01c4SChristoph Hellwig 	}
1073080d01c4SChristoph Hellwig 	xfs_clear_zonegc_running(mp);
1074080d01c4SChristoph Hellwig 
1075080d01c4SChristoph Hellwig 	if (data->iter.victim_rtg)
1076080d01c4SChristoph Hellwig 		xfs_rtgroup_rele(data->iter.victim_rtg);
1077080d01c4SChristoph Hellwig 
1078080d01c4SChristoph Hellwig 	memalloc_nofs_restore(nofs_flag);
1079080d01c4SChristoph Hellwig 	xfs_zone_gc_data_free(data);
1080080d01c4SChristoph Hellwig 	return 0;
1081080d01c4SChristoph Hellwig }
1082080d01c4SChristoph Hellwig 
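/*
 * Starting and stopping GC just unparks/parks the thread created in
 * xfs_zone_gc_mount(), so the kthread itself stays around for the entire
 * life of the mount.
 */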
1083080d01c4SChristoph Hellwig void
1084080d01c4SChristoph Hellwig xfs_zone_gc_start(
1085080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1086080d01c4SChristoph Hellwig {
1087080d01c4SChristoph Hellwig 	if (xfs_has_zoned(mp))
1088080d01c4SChristoph Hellwig 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1089080d01c4SChristoph Hellwig }
1090080d01c4SChristoph Hellwig 
1091080d01c4SChristoph Hellwig void
1092080d01c4SChristoph Hellwig xfs_zone_gc_stop(
1093080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1094080d01c4SChristoph Hellwig {
1095080d01c4SChristoph Hellwig 	if (xfs_has_zoned(mp))
1096080d01c4SChristoph Hellwig 		kthread_park(mp->m_zone_info->zi_gc_thread);
1097080d01c4SChristoph Hellwig }
1098080d01c4SChristoph Hellwig 
1099080d01c4SChristoph Hellwig int
1100080d01c4SChristoph Hellwig xfs_zone_gc_mount(
1101080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1102080d01c4SChristoph Hellwig {
1103080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
1104080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data;
1105080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
1106080d01c4SChristoph Hellwig 	int			error;
1107080d01c4SChristoph Hellwig 
1108080d01c4SChristoph Hellwig 	/*
1109080d01c4SChristoph Hellwig 	 * If there are no free zones available for GC, pick the open zone with
1110080d01c4SChristoph Hellwig 	 * the least used space to GC into.  This should only happen after an
1111080d01c4SChristoph Hellwig 	 * unclean shutdown near ENOSPC while GC was ongoing.
1112080d01c4SChristoph Hellwig 	 *
1113080d01c4SChristoph Hellwig 	 * We also need to do this for the first gc zone allocation if we
1114080d01c4SChristoph Hellwig 	 * unmounted while at the open limit.
1115080d01c4SChristoph Hellwig 	 */
1116080d01c4SChristoph Hellwig 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1117080d01c4SChristoph Hellwig 	    zi->zi_nr_open_zones == mp->m_max_open_zones)
1118080d01c4SChristoph Hellwig 		oz = xfs_zone_gc_steal_open(zi);
1119080d01c4SChristoph Hellwig 	else
1120*64d03611SHans Holmberg 		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1121080d01c4SChristoph Hellwig 	if (!oz) {
1122080d01c4SChristoph Hellwig 		xfs_warn(mp, "unable to allocate a zone for gc");
1123080d01c4SChristoph Hellwig 		error = -EIO;
1124080d01c4SChristoph Hellwig 		goto out;
1125080d01c4SChristoph Hellwig 	}
1126080d01c4SChristoph Hellwig 
1127080d01c4SChristoph Hellwig 	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1128080d01c4SChristoph Hellwig 	zi->zi_open_gc_zone = oz;
1129080d01c4SChristoph Hellwig 
1130080d01c4SChristoph Hellwig 	data = xfs_zone_gc_data_alloc(mp);
1131080d01c4SChristoph Hellwig 	if (!data) {
1132080d01c4SChristoph Hellwig 		error = -ENOMEM;
1133080d01c4SChristoph Hellwig 		goto out_put_gc_zone;
1134080d01c4SChristoph Hellwig 	}
1135080d01c4SChristoph Hellwig 
1136080d01c4SChristoph Hellwig 	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1137080d01c4SChristoph Hellwig 			"xfs-zone-gc/%s", mp->m_super->s_id);
1138080d01c4SChristoph Hellwig 	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
1139080d01c4SChristoph Hellwig 		xfs_warn(mp, "unable to create zone gc thread");
1140080d01c4SChristoph Hellwig 		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
1141080d01c4SChristoph Hellwig 		goto out_free_gc_data;
1142080d01c4SChristoph Hellwig 	}
1143080d01c4SChristoph Hellwig 
1144080d01c4SChristoph Hellwig 	/* xfs_zone_gc_start will unpark for rw mounts */
1145080d01c4SChristoph Hellwig 	kthread_park(mp->m_zone_info->zi_gc_thread);
1146080d01c4SChristoph Hellwig 	return 0;
1147080d01c4SChristoph Hellwig 
1148080d01c4SChristoph Hellwig out_free_gc_data:
1149080d01c4SChristoph Hellwig 	kfree(data);
1150080d01c4SChristoph Hellwig out_put_gc_zone:
1151080d01c4SChristoph Hellwig 	xfs_open_zone_put(zi->zi_open_gc_zone);
1152080d01c4SChristoph Hellwig out:
1153080d01c4SChristoph Hellwig 	return error;
1154080d01c4SChristoph Hellwig }
1155080d01c4SChristoph Hellwig 
1156080d01c4SChristoph Hellwig void
1157080d01c4SChristoph Hellwig xfs_zone_gc_unmount(
1158080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1159080d01c4SChristoph Hellwig {
1160080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
1161080d01c4SChristoph Hellwig 
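	/*
	 * kthread_stop() waits for the GC thread to exit, which also frees
	 * the GC data in xfs_zoned_gcd(), so only the open GC zone reference
	 * is dropped here.
	 */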
1162080d01c4SChristoph Hellwig 	kthread_stop(zi->zi_gc_thread);
1163080d01c4SChristoph Hellwig 	if (zi->zi_open_gc_zone)
1164080d01c4SChristoph Hellwig 		xfs_open_zone_put(zi->zi_open_gc_zone);
1165080d01c4SChristoph Hellwig }
1166