xref: /linux/fs/xfs/xfs_zone_gc.c (revision 080d01c41d44f0993f2c235a6bfdb681f0a66be6)
1*080d01c4SChristoph Hellwig // SPDX-License-Identifier: GPL-2.0
2*080d01c4SChristoph Hellwig /*
3*080d01c4SChristoph Hellwig  * Copyright (c) 2023-2025 Christoph Hellwig.
4*080d01c4SChristoph Hellwig  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5*080d01c4SChristoph Hellwig  */
6*080d01c4SChristoph Hellwig #include "xfs.h"
7*080d01c4SChristoph Hellwig #include "xfs_shared.h"
8*080d01c4SChristoph Hellwig #include "xfs_format.h"
9*080d01c4SChristoph Hellwig #include "xfs_log_format.h"
10*080d01c4SChristoph Hellwig #include "xfs_trans_resv.h"
11*080d01c4SChristoph Hellwig #include "xfs_mount.h"
12*080d01c4SChristoph Hellwig #include "xfs_inode.h"
13*080d01c4SChristoph Hellwig #include "xfs_btree.h"
14*080d01c4SChristoph Hellwig #include "xfs_trans.h"
15*080d01c4SChristoph Hellwig #include "xfs_icache.h"
16*080d01c4SChristoph Hellwig #include "xfs_rmap.h"
17*080d01c4SChristoph Hellwig #include "xfs_rtbitmap.h"
18*080d01c4SChristoph Hellwig #include "xfs_rtrmap_btree.h"
19*080d01c4SChristoph Hellwig #include "xfs_zone_alloc.h"
20*080d01c4SChristoph Hellwig #include "xfs_zone_priv.h"
21*080d01c4SChristoph Hellwig #include "xfs_zones.h"
22*080d01c4SChristoph Hellwig #include "xfs_trace.h"
23*080d01c4SChristoph Hellwig 
24*080d01c4SChristoph Hellwig /*
25*080d01c4SChristoph Hellwig  * Implement Garbage Collection (GC) of partially used zones.
26*080d01c4SChristoph Hellwig  *
27*080d01c4SChristoph Hellwig  * To support the purely sequential writes in each zone, zoned XFS needs to be
28*080d01c4SChristoph Hellwig  * able to move data remaining in a zone out of it to reset the zone to prepare
29*080d01c4SChristoph Hellwig  * for writing to it again.
30*080d01c4SChristoph Hellwig  *
31*080d01c4SChristoph Hellwig  * This is done by the GC thread implemented in this file.  To support that, a
32*080d01c4SChristoph Hellwig  * number of zones (XFS_GC_ZONES) is reserved from the user-visible capacity to
33*080d01c4SChristoph Hellwig  * write the garbage collected data into.
34*080d01c4SChristoph Hellwig  *
35*080d01c4SChristoph Hellwig  * Whenever the available space is below the chosen threshold, the GC thread
36*080d01c4SChristoph Hellwig  * looks for non-empty but not fully used zones that might be worth
37*080d01c4SChristoph Hellwig  * reclaiming.  Once found, the rmap for the victim zone is queried, and after
38*080d01c4SChristoph Hellwig  * a bit of sorting to reduce fragmentation, the still live extents are read
39*080d01c4SChristoph Hellwig  * into memory and written to the GC target zone, and the bmap btree of the
40*080d01c4SChristoph Hellwig  * files is updated to point to the new location.  To avoid taking the IOLOCK
41*080d01c4SChristoph Hellwig  * and MMAPLOCK for the entire GC process and thus affecting the latency of
42*080d01c4SChristoph Hellwig  * user reads and writes to the files, the GC writes are speculative and the
43*080d01c4SChristoph Hellwig  * I/O completion checks that no other writes happened for the affected regions
44*080d01c4SChristoph Hellwig  * before remapping.
45*080d01c4SChristoph Hellwig  *
46*080d01c4SChristoph Hellwig  * Once a zone does not contain any valid data, be that through GC or user
47*080d01c4SChristoph Hellwig  * block removal, it is queued for a zone reset.  The reset operation
48*080d01c4SChristoph Hellwig  * carefully ensures that the RT device cache is flushed and all transactions
49*080d01c4SChristoph Hellwig  * referencing the rmap have been committed to disk.
50*080d01c4SChristoph Hellwig  */
51*080d01c4SChristoph Hellwig 
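/*
 * Rough per-iteration flow of the GC worker (see xfs_zone_gc_handle_work()
 * below):
 *
 *  1. submit resets for zones queued on the reset list
 *  2. retire completed zone resets, marking the zones free
 *  3. retire completed GC writes by remapping the file extents
 *  4. turn completed GC reads into writes to the target zone
 *  5. pick a victim zone (if needed) and start reading more live data
 */
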
52*080d01c4SChristoph Hellwig /*
53*080d01c4SChristoph Hellwig  * Size of each GC scratch pad.  This is also the upper bound for each
54*080d01c4SChristoph Hellwig  * GC I/O, which helps to keep latency down.
55*080d01c4SChristoph Hellwig  */
56*080d01c4SChristoph Hellwig #define XFS_GC_CHUNK_SIZE	SZ_1M
57*080d01c4SChristoph Hellwig 
58*080d01c4SChristoph Hellwig /*
59*080d01c4SChristoph Hellwig  * Scratchpad memory to read GCed data into.
60*080d01c4SChristoph Hellwig  *
61*080d01c4SChristoph Hellwig  * The offset member tracks where the next allocation starts, and freed tracks
62*080d01c4SChristoph Hellwig  * the amount of space that is not used anymore.
63*080d01c4SChristoph Hellwig  */
64*080d01c4SChristoph Hellwig #define XFS_ZONE_GC_NR_SCRATCH	2
65*080d01c4SChristoph Hellwig struct xfs_zone_scratch {
66*080d01c4SChristoph Hellwig 	struct folio			*folio;
67*080d01c4SChristoph Hellwig 	unsigned int			offset;
68*080d01c4SChristoph Hellwig 	unsigned int			freed;
69*080d01c4SChristoph Hellwig };
70*080d01c4SChristoph Hellwig 
71*080d01c4SChristoph Hellwig /*
72*080d01c4SChristoph Hellwig  * Chunk that is read and written for each GC operation.
73*080d01c4SChristoph Hellwig  *
74*080d01c4SChristoph Hellwig  * Note that for writes to actual zoned devices, the chunk can be split when
75*080d01c4SChristoph Hellwig  * reaching the hardware limit.
76*080d01c4SChristoph Hellwig  */
77*080d01c4SChristoph Hellwig struct xfs_gc_bio {
78*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data		*data;
79*080d01c4SChristoph Hellwig 
80*080d01c4SChristoph Hellwig 	/*
81*080d01c4SChristoph Hellwig 	 * Entry into the reading/writing/resetting list.  Only accessed from
82*080d01c4SChristoph Hellwig 	 * the GC thread, so no locking needed.
83*080d01c4SChristoph Hellwig 	 */
84*080d01c4SChristoph Hellwig 	struct list_head		entry;
85*080d01c4SChristoph Hellwig 
86*080d01c4SChristoph Hellwig 	/*
87*080d01c4SChristoph Hellwig 	 * State of this gc_bio.  Done means the current I/O completed.
88*080d01c4SChristoph Hellwig 	 * Set from the bio end I/O handler, read from the GC thread.
89*080d01c4SChristoph Hellwig 	 */
90*080d01c4SChristoph Hellwig 	enum {
91*080d01c4SChristoph Hellwig 		XFS_GC_BIO_NEW,
92*080d01c4SChristoph Hellwig 		XFS_GC_BIO_DONE,
93*080d01c4SChristoph Hellwig 	} state;
94*080d01c4SChristoph Hellwig 
95*080d01c4SChristoph Hellwig 	/*
96*080d01c4SChristoph Hellwig 	 * Pointer to the inode and byte range in the inode that this
97*080d01c4SChristoph Hellwig 	 * GC chunk is operating on.
98*080d01c4SChristoph Hellwig 	 */
99*080d01c4SChristoph Hellwig 	struct xfs_inode		*ip;
100*080d01c4SChristoph Hellwig 	loff_t				offset;
101*080d01c4SChristoph Hellwig 	unsigned int			len;
102*080d01c4SChristoph Hellwig 
103*080d01c4SChristoph Hellwig 	/*
104*080d01c4SChristoph Hellwig 	 * Existing startblock (in the zone to be freed) and newly assigned
105*080d01c4SChristoph Hellwig 	 * daddr in the zone GCed into.
106*080d01c4SChristoph Hellwig 	 */
107*080d01c4SChristoph Hellwig 	xfs_fsblock_t			old_startblock;
108*080d01c4SChristoph Hellwig 	xfs_daddr_t			new_daddr;
109*080d01c4SChristoph Hellwig 	struct xfs_zone_scratch		*scratch;
110*080d01c4SChristoph Hellwig 
111*080d01c4SChristoph Hellwig 	/* Are we writing to a sequential write required zone? */
112*080d01c4SChristoph Hellwig 	bool				is_seq;
113*080d01c4SChristoph Hellwig 
114*080d01c4SChristoph Hellwig 	/* Open Zone being written to */
115*080d01c4SChristoph Hellwig 	struct xfs_open_zone		*oz;
116*080d01c4SChristoph Hellwig 
117*080d01c4SChristoph Hellwig 	/* Bio used for reads and writes, including the bvec used by it */
118*080d01c4SChristoph Hellwig 	struct bio_vec			bv;
119*080d01c4SChristoph Hellwig 	struct bio			bio;	/* must be last */
120*080d01c4SChristoph Hellwig };
121*080d01c4SChristoph Hellwig 
122*080d01c4SChristoph Hellwig #define XFS_ZONE_GC_RECS		1024
123*080d01c4SChristoph Hellwig 
124*080d01c4SChristoph Hellwig /* iterator, needs to be reinitialized for each victim zone */
125*080d01c4SChristoph Hellwig struct xfs_zone_gc_iter {
126*080d01c4SChristoph Hellwig 	struct xfs_rtgroup		*victim_rtg;
127*080d01c4SChristoph Hellwig 	unsigned int			rec_count;
128*080d01c4SChristoph Hellwig 	unsigned int			rec_idx;
129*080d01c4SChristoph Hellwig 	xfs_agblock_t			next_startblock;
130*080d01c4SChristoph Hellwig 	struct xfs_rmap_irec		*recs;
131*080d01c4SChristoph Hellwig };
132*080d01c4SChristoph Hellwig 
133*080d01c4SChristoph Hellwig /*
134*080d01c4SChristoph Hellwig  * Per-mount GC state.
135*080d01c4SChristoph Hellwig  */
136*080d01c4SChristoph Hellwig struct xfs_zone_gc_data {
137*080d01c4SChristoph Hellwig 	struct xfs_mount		*mp;
138*080d01c4SChristoph Hellwig 
139*080d01c4SChristoph Hellwig 	/* bioset used to allocate the gc_bios */
140*080d01c4SChristoph Hellwig 	struct bio_set			bio_set;
141*080d01c4SChristoph Hellwig 
142*080d01c4SChristoph Hellwig 	/*
143*080d01c4SChristoph Hellwig 	 * Scratchpads used, and index indicating which one is currently used.
144*080d01c4SChristoph Hellwig 	 */
145*080d01c4SChristoph Hellwig 	struct xfs_zone_scratch		scratch[XFS_ZONE_GC_NR_SCRATCH];
146*080d01c4SChristoph Hellwig 	unsigned int			scratch_idx;
147*080d01c4SChristoph Hellwig 
148*080d01c4SChristoph Hellwig 	/*
149*080d01c4SChristoph Hellwig 	 * Lists of bios currently being read, written and reset.
150*080d01c4SChristoph Hellwig 	 * These lists are only accessed by the GC thread itself, and must only
151*080d01c4SChristoph Hellwig 	 * be processed in order.
152*080d01c4SChristoph Hellwig 	 */
153*080d01c4SChristoph Hellwig 	struct list_head		reading;
154*080d01c4SChristoph Hellwig 	struct list_head		writing;
155*080d01c4SChristoph Hellwig 	struct list_head		resetting;
156*080d01c4SChristoph Hellwig 
157*080d01c4SChristoph Hellwig 	/*
158*080d01c4SChristoph Hellwig 	 * Iterator for the victim zone.
159*080d01c4SChristoph Hellwig 	 */
160*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter		iter;
161*080d01c4SChristoph Hellwig };
162*080d01c4SChristoph Hellwig 
163*080d01c4SChristoph Hellwig /*
164*080d01c4SChristoph Hellwig  * We aim to keep enough zones free in stock to fully use the open zone limit
165*080d01c4SChristoph Hellwig  * for data placement purposes.
166*080d01c4SChristoph Hellwig  */
167*080d01c4SChristoph Hellwig bool
168*080d01c4SChristoph Hellwig xfs_zoned_need_gc(
169*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
170*080d01c4SChristoph Hellwig {
171*080d01c4SChristoph Hellwig 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
172*080d01c4SChristoph Hellwig 		return false;
173*080d01c4SChristoph Hellwig 	if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
174*080d01c4SChristoph Hellwig 	    mp->m_groups[XG_TYPE_RTG].blocks *
175*080d01c4SChristoph Hellwig 	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
176*080d01c4SChristoph Hellwig 		return true;
177*080d01c4SChristoph Hellwig 	return false;
178*080d01c4SChristoph Hellwig }
179*080d01c4SChristoph Hellwig 
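/*
 * Allocate the per-mount GC state: the rmap record buffer for the iterator,
 * the bioset used for GC chunk bios, and the scratch folios that GCed data
 * is staged in.
 */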
180*080d01c4SChristoph Hellwig static struct xfs_zone_gc_data *
181*080d01c4SChristoph Hellwig xfs_zone_gc_data_alloc(
182*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
183*080d01c4SChristoph Hellwig {
184*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data;
185*080d01c4SChristoph Hellwig 	int			i;
186*080d01c4SChristoph Hellwig 
187*080d01c4SChristoph Hellwig 	data = kzalloc(sizeof(*data), GFP_KERNEL);
188*080d01c4SChristoph Hellwig 	if (!data)
189*080d01c4SChristoph Hellwig 		return NULL;
190*080d01c4SChristoph Hellwig 	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
191*080d01c4SChristoph Hellwig 			GFP_KERNEL);
192*080d01c4SChristoph Hellwig 	if (!data->iter.recs)
193*080d01c4SChristoph Hellwig 		goto out_free_data;
194*080d01c4SChristoph Hellwig 
195*080d01c4SChristoph Hellwig 	/*
196*080d01c4SChristoph Hellwig 	 * We actually only need a single bio_vec.  It would be nice to have
197*080d01c4SChristoph Hellwig 	 * a flag that only allocates the inline bvecs and not the separate
198*080d01c4SChristoph Hellwig 	 * bvec pool.
199*080d01c4SChristoph Hellwig 	 */
200*080d01c4SChristoph Hellwig 	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
201*080d01c4SChristoph Hellwig 			BIOSET_NEED_BVECS))
202*080d01c4SChristoph Hellwig 		goto out_free_recs;
203*080d01c4SChristoph Hellwig 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
204*080d01c4SChristoph Hellwig 		data->scratch[i].folio =
205*080d01c4SChristoph Hellwig 			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
206*080d01c4SChristoph Hellwig 		if (!data->scratch[i].folio)
207*080d01c4SChristoph Hellwig 			goto out_free_scratch;
208*080d01c4SChristoph Hellwig 	}
209*080d01c4SChristoph Hellwig 	INIT_LIST_HEAD(&data->reading);
210*080d01c4SChristoph Hellwig 	INIT_LIST_HEAD(&data->writing);
211*080d01c4SChristoph Hellwig 	INIT_LIST_HEAD(&data->resetting);
212*080d01c4SChristoph Hellwig 	data->mp = mp;
213*080d01c4SChristoph Hellwig 	return data;
214*080d01c4SChristoph Hellwig 
215*080d01c4SChristoph Hellwig out_free_scratch:
216*080d01c4SChristoph Hellwig 	while (--i >= 0)
217*080d01c4SChristoph Hellwig 		folio_put(data->scratch[i].folio);
218*080d01c4SChristoph Hellwig 	bioset_exit(&data->bio_set);
219*080d01c4SChristoph Hellwig out_free_recs:
220*080d01c4SChristoph Hellwig 	kfree(data->iter.recs);
221*080d01c4SChristoph Hellwig out_free_data:
222*080d01c4SChristoph Hellwig 	kfree(data);
223*080d01c4SChristoph Hellwig 	return NULL;
224*080d01c4SChristoph Hellwig }
225*080d01c4SChristoph Hellwig 
226*080d01c4SChristoph Hellwig static void
227*080d01c4SChristoph Hellwig xfs_zone_gc_data_free(
228*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
229*080d01c4SChristoph Hellwig {
230*080d01c4SChristoph Hellwig 	int			i;
231*080d01c4SChristoph Hellwig 
232*080d01c4SChristoph Hellwig 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
233*080d01c4SChristoph Hellwig 		folio_put(data->scratch[i].folio);
234*080d01c4SChristoph Hellwig 	bioset_exit(&data->bio_set);
235*080d01c4SChristoph Hellwig 	kfree(data->iter.recs);
236*080d01c4SChristoph Hellwig 	kfree(data);
237*080d01c4SChristoph Hellwig }
238*080d01c4SChristoph Hellwig 
239*080d01c4SChristoph Hellwig static void
240*080d01c4SChristoph Hellwig xfs_zone_gc_iter_init(
241*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter,
242*080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*victim_rtg)
243*080d01c4SChristoph Hellwig 
244*080d01c4SChristoph Hellwig {
245*080d01c4SChristoph Hellwig 	iter->next_startblock = 0;
246*080d01c4SChristoph Hellwig 	iter->rec_count = 0;
247*080d01c4SChristoph Hellwig 	iter->rec_idx = 0;
248*080d01c4SChristoph Hellwig 	iter->victim_rtg = victim_rtg;
249*080d01c4SChristoph Hellwig }
250*080d01c4SChristoph Hellwig 
251*080d01c4SChristoph Hellwig /*
252*080d01c4SChristoph Hellwig  * Query the rmap of the victim zone to gather the records to evacuate.
253*080d01c4SChristoph Hellwig  */
254*080d01c4SChristoph Hellwig static int
255*080d01c4SChristoph Hellwig xfs_zone_gc_query_cb(
256*080d01c4SChristoph Hellwig 	struct xfs_btree_cur	*cur,
257*080d01c4SChristoph Hellwig 	const struct xfs_rmap_irec *irec,
258*080d01c4SChristoph Hellwig 	void			*private)
259*080d01c4SChristoph Hellwig {
260*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter = private;
261*080d01c4SChristoph Hellwig 
262*080d01c4SChristoph Hellwig 	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
263*080d01c4SChristoph Hellwig 	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
264*080d01c4SChristoph Hellwig 	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
265*080d01c4SChristoph Hellwig 
266*080d01c4SChristoph Hellwig 	iter->recs[iter->rec_count] = *irec;
267*080d01c4SChristoph Hellwig 	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
268*080d01c4SChristoph Hellwig 		iter->next_startblock =
269*080d01c4SChristoph Hellwig 			irec->rm_startblock + irec->rm_blockcount;
270*080d01c4SChristoph Hellwig 		return 1;
271*080d01c4SChristoph Hellwig 	}
272*080d01c4SChristoph Hellwig 	return 0;
273*080d01c4SChristoph Hellwig }
274*080d01c4SChristoph Hellwig 
275*080d01c4SChristoph Hellwig #define cmp_int(l, r)		((l > r) - (l < r))
276*080d01c4SChristoph Hellwig 
277*080d01c4SChristoph Hellwig static int
278*080d01c4SChristoph Hellwig xfs_zone_gc_rmap_rec_cmp(
279*080d01c4SChristoph Hellwig 	const void			*a,
280*080d01c4SChristoph Hellwig 	const void			*b)
281*080d01c4SChristoph Hellwig {
282*080d01c4SChristoph Hellwig 	const struct xfs_rmap_irec	*reca = a;
283*080d01c4SChristoph Hellwig 	const struct xfs_rmap_irec	*recb = b;
284*080d01c4SChristoph Hellwig 	int				diff;
285*080d01c4SChristoph Hellwig 
286*080d01c4SChristoph Hellwig 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
287*080d01c4SChristoph Hellwig 	if (diff)
288*080d01c4SChristoph Hellwig 		return diff;
289*080d01c4SChristoph Hellwig 	return cmp_int(reca->rm_offset, recb->rm_offset);
290*080d01c4SChristoph Hellwig }
291*080d01c4SChristoph Hellwig 
292*080d01c4SChristoph Hellwig static int
293*080d01c4SChristoph Hellwig xfs_zone_gc_query(
294*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp,
295*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter)
296*080d01c4SChristoph Hellwig {
297*080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg = iter->victim_rtg;
298*080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	ri_low = { };
299*080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	ri_high;
300*080d01c4SChristoph Hellwig 	struct xfs_btree_cur	*cur;
301*080d01c4SChristoph Hellwig 	struct xfs_trans	*tp;
302*080d01c4SChristoph Hellwig 	int			error;
303*080d01c4SChristoph Hellwig 
304*080d01c4SChristoph Hellwig 	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
305*080d01c4SChristoph Hellwig 	if (iter->next_startblock == rtg_blocks(rtg))
306*080d01c4SChristoph Hellwig 		goto done;
307*080d01c4SChristoph Hellwig 
308*080d01c4SChristoph Hellwig 	ASSERT(iter->next_startblock < rtg_blocks(rtg));
309*080d01c4SChristoph Hellwig 	ri_low.rm_startblock = iter->next_startblock;
310*080d01c4SChristoph Hellwig 	memset(&ri_high, 0xFF, sizeof(ri_high));
311*080d01c4SChristoph Hellwig 
312*080d01c4SChristoph Hellwig 	iter->rec_idx = 0;
313*080d01c4SChristoph Hellwig 	iter->rec_count = 0;
314*080d01c4SChristoph Hellwig 
315*080d01c4SChristoph Hellwig 	error = xfs_trans_alloc_empty(mp, &tp);
316*080d01c4SChristoph Hellwig 	if (error)
317*080d01c4SChristoph Hellwig 		return error;
318*080d01c4SChristoph Hellwig 
319*080d01c4SChristoph Hellwig 	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
320*080d01c4SChristoph Hellwig 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
321*080d01c4SChristoph Hellwig 	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
322*080d01c4SChristoph Hellwig 			xfs_zone_gc_query_cb, iter);
323*080d01c4SChristoph Hellwig 	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
324*080d01c4SChristoph Hellwig 	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
325*080d01c4SChristoph Hellwig 	xfs_trans_cancel(tp);
326*080d01c4SChristoph Hellwig 
327*080d01c4SChristoph Hellwig 	if (error < 0)
328*080d01c4SChristoph Hellwig 		return error;
329*080d01c4SChristoph Hellwig 
330*080d01c4SChristoph Hellwig 	/*
331*080d01c4SChristoph Hellwig 	 * Sort the rmap records by inode number and increasing offset to
332*080d01c4SChristoph Hellwig 	 * defragment the mappings.
333*080d01c4SChristoph Hellwig 	 *
334*080d01c4SChristoph Hellwig 	 * This could be further enhanced by an even bigger lookahead window,
335*080d01c4SChristoph Hellwig 	 * but that's better left until we have better detection of changes to
336*080d01c4SChristoph Hellwig 	 * the inode mappings to avoid the potential of GCing already dead data.
337*080d01c4SChristoph Hellwig 	 */
338*080d01c4SChristoph Hellwig 	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
339*080d01c4SChristoph Hellwig 			xfs_zone_gc_rmap_rec_cmp, NULL);
340*080d01c4SChristoph Hellwig 
341*080d01c4SChristoph Hellwig 	if (error == 0) {
342*080d01c4SChristoph Hellwig 		/*
343*080d01c4SChristoph Hellwig 		 * We finished iterating through the zone.
344*080d01c4SChristoph Hellwig 		 */
345*080d01c4SChristoph Hellwig 		iter->next_startblock = rtg_blocks(rtg);
346*080d01c4SChristoph Hellwig 		if (iter->rec_count == 0)
347*080d01c4SChristoph Hellwig 			goto done;
348*080d01c4SChristoph Hellwig 	}
349*080d01c4SChristoph Hellwig 
350*080d01c4SChristoph Hellwig 	return 0;
351*080d01c4SChristoph Hellwig done:
352*080d01c4SChristoph Hellwig 	xfs_rtgroup_rele(iter->victim_rtg);
353*080d01c4SChristoph Hellwig 	iter->victim_rtg = NULL;
354*080d01c4SChristoph Hellwig 	return 0;
355*080d01c4SChristoph Hellwig }
356*080d01c4SChristoph Hellwig 
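/*
 * Return the next rmap record to evacuate and grab a reference to the inode
 * owning it.  Records for deleted inodes and for inodes that are not regular
 * realtime files are skipped.
 */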
357*080d01c4SChristoph Hellwig static bool
358*080d01c4SChristoph Hellwig xfs_zone_gc_iter_next(
359*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp,
360*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter,
361*080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	*chunk_rec,
362*080d01c4SChristoph Hellwig 	struct xfs_inode	**ipp)
363*080d01c4SChristoph Hellwig {
364*080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	*irec;
365*080d01c4SChristoph Hellwig 	int			error;
366*080d01c4SChristoph Hellwig 
367*080d01c4SChristoph Hellwig 	if (!iter->victim_rtg)
368*080d01c4SChristoph Hellwig 		return false;
369*080d01c4SChristoph Hellwig 
370*080d01c4SChristoph Hellwig retry:
371*080d01c4SChristoph Hellwig 	if (iter->rec_idx == iter->rec_count) {
372*080d01c4SChristoph Hellwig 		error = xfs_zone_gc_query(mp, iter);
373*080d01c4SChristoph Hellwig 		if (error)
374*080d01c4SChristoph Hellwig 			goto fail;
375*080d01c4SChristoph Hellwig 		if (!iter->victim_rtg)
376*080d01c4SChristoph Hellwig 			return false;
377*080d01c4SChristoph Hellwig 	}
378*080d01c4SChristoph Hellwig 
379*080d01c4SChristoph Hellwig 	irec = &iter->recs[iter->rec_idx];
380*080d01c4SChristoph Hellwig 	error = xfs_iget(mp, NULL, irec->rm_owner,
381*080d01c4SChristoph Hellwig 			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
382*080d01c4SChristoph Hellwig 	if (error) {
383*080d01c4SChristoph Hellwig 		/*
384*080d01c4SChristoph Hellwig 		 * If the inode was already deleted, skip over it.
385*080d01c4SChristoph Hellwig 		 */
386*080d01c4SChristoph Hellwig 		if (error == -ENOENT) {
387*080d01c4SChristoph Hellwig 			iter->rec_idx++;
388*080d01c4SChristoph Hellwig 			goto retry;
389*080d01c4SChristoph Hellwig 		}
390*080d01c4SChristoph Hellwig 		goto fail;
391*080d01c4SChristoph Hellwig 	}
392*080d01c4SChristoph Hellwig 
393*080d01c4SChristoph Hellwig 	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
394*080d01c4SChristoph Hellwig 		iter->rec_idx++;
395*080d01c4SChristoph Hellwig 		xfs_irele(*ipp);
396*080d01c4SChristoph Hellwig 		goto retry;
397*080d01c4SChristoph Hellwig 	}
398*080d01c4SChristoph Hellwig 
399*080d01c4SChristoph Hellwig 	*chunk_rec = *irec;
400*080d01c4SChristoph Hellwig 	return true;
401*080d01c4SChristoph Hellwig 
402*080d01c4SChristoph Hellwig fail:
403*080d01c4SChristoph Hellwig 	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
404*080d01c4SChristoph Hellwig 	return false;
405*080d01c4SChristoph Hellwig }
406*080d01c4SChristoph Hellwig 
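/*
 * Consume count_fsb blocks from the current rmap record, moving on to the
 * next record once it is fully used up.
 */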
407*080d01c4SChristoph Hellwig static void
408*080d01c4SChristoph Hellwig xfs_zone_gc_iter_advance(
409*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter,
410*080d01c4SChristoph Hellwig 	xfs_extlen_t		count_fsb)
411*080d01c4SChristoph Hellwig {
412*080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
413*080d01c4SChristoph Hellwig 
414*080d01c4SChristoph Hellwig 	irec->rm_offset += count_fsb;
415*080d01c4SChristoph Hellwig 	irec->rm_startblock += count_fsb;
416*080d01c4SChristoph Hellwig 	irec->rm_blockcount -= count_fsb;
417*080d01c4SChristoph Hellwig 	if (!irec->rm_blockcount)
418*080d01c4SChristoph Hellwig 		iter->rec_idx++;
419*080d01c4SChristoph Hellwig }
420*080d01c4SChristoph Hellwig 
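/*
 * Scan one used-space bucket for the zone with the fewest used blocks and
 * return it with a reference held, or NULL if the bucket contains no
 * suitable zone.
 */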
421*080d01c4SChristoph Hellwig static struct xfs_rtgroup *
422*080d01c4SChristoph Hellwig xfs_zone_gc_pick_victim_from(
423*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp,
424*080d01c4SChristoph Hellwig 	uint32_t		bucket)
425*080d01c4SChristoph Hellwig {
426*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
427*080d01c4SChristoph Hellwig 	uint32_t		victim_used = U32_MAX;
428*080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*victim_rtg = NULL;
429*080d01c4SChristoph Hellwig 	uint32_t		bit;
430*080d01c4SChristoph Hellwig 
431*080d01c4SChristoph Hellwig 	if (!zi->zi_used_bucket_entries[bucket])
432*080d01c4SChristoph Hellwig 		return NULL;
433*080d01c4SChristoph Hellwig 
434*080d01c4SChristoph Hellwig 	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
435*080d01c4SChristoph Hellwig 			mp->m_sb.sb_rgcount) {
436*080d01c4SChristoph Hellwig 		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
437*080d01c4SChristoph Hellwig 
438*080d01c4SChristoph Hellwig 		if (!rtg)
439*080d01c4SChristoph Hellwig 			continue;
440*080d01c4SChristoph Hellwig 
441*080d01c4SChristoph Hellwig 		/* skip zones that are just waiting for a reset */
442*080d01c4SChristoph Hellwig 		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
443*080d01c4SChristoph Hellwig 		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
444*080d01c4SChristoph Hellwig 			xfs_rtgroup_rele(rtg);
445*080d01c4SChristoph Hellwig 			continue;
446*080d01c4SChristoph Hellwig 		}
447*080d01c4SChristoph Hellwig 
448*080d01c4SChristoph Hellwig 		if (victim_rtg)
449*080d01c4SChristoph Hellwig 			xfs_rtgroup_rele(victim_rtg);
450*080d01c4SChristoph Hellwig 		victim_rtg = rtg;
451*080d01c4SChristoph Hellwig 		victim_used = rtg_rmap(rtg)->i_used_blocks;
452*080d01c4SChristoph Hellwig 
453*080d01c4SChristoph Hellwig 		/*
454*080d01c4SChristoph Hellwig 		 * Any zone that is less than 1 percent used is fair game for
455*080d01c4SChristoph Hellwig 		 * instant reclaim. All of these zones are in the lowest
456*080d01c4SChristoph Hellwig 		 * bucket, so avoid the expensive division for the zones
457*080d01c4SChristoph Hellwig 		 * in the other buckets.
458*080d01c4SChristoph Hellwig 		 */
459*080d01c4SChristoph Hellwig 		if (bucket == 0 &&
460*080d01c4SChristoph Hellwig 		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
461*080d01c4SChristoph Hellwig 			break;
462*080d01c4SChristoph Hellwig 	}
463*080d01c4SChristoph Hellwig 
464*080d01c4SChristoph Hellwig 	return victim_rtg;
465*080d01c4SChristoph Hellwig }
466*080d01c4SChristoph Hellwig 
467*080d01c4SChristoph Hellwig /*
468*080d01c4SChristoph Hellwig  * Iterate through all zones marked as reclaimable and find a candidate to
469*080d01c4SChristoph Hellwig  * reclaim.
470*080d01c4SChristoph Hellwig  */
471*080d01c4SChristoph Hellwig static bool
472*080d01c4SChristoph Hellwig xfs_zone_gc_select_victim(
473*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
474*080d01c4SChristoph Hellwig {
475*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter = &data->iter;
476*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
477*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
478*080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*victim_rtg = NULL;
479*080d01c4SChristoph Hellwig 	unsigned int		bucket;
480*080d01c4SChristoph Hellwig 
481*080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp))
482*080d01c4SChristoph Hellwig 		return false;
483*080d01c4SChristoph Hellwig 
484*080d01c4SChristoph Hellwig 	if (iter->victim_rtg)
485*080d01c4SChristoph Hellwig 		return true;
486*080d01c4SChristoph Hellwig 
487*080d01c4SChristoph Hellwig 	/*
488*080d01c4SChristoph Hellwig 	 * Don't start new work if we are asked to stop or park.
489*080d01c4SChristoph Hellwig 	 */
490*080d01c4SChristoph Hellwig 	if (kthread_should_stop() || kthread_should_park())
491*080d01c4SChristoph Hellwig 		return false;
492*080d01c4SChristoph Hellwig 
493*080d01c4SChristoph Hellwig 	if (!xfs_zoned_need_gc(mp))
494*080d01c4SChristoph Hellwig 		return false;
495*080d01c4SChristoph Hellwig 
496*080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_used_buckets_lock);
497*080d01c4SChristoph Hellwig 	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
498*080d01c4SChristoph Hellwig 		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
499*080d01c4SChristoph Hellwig 		if (victim_rtg)
500*080d01c4SChristoph Hellwig 			break;
501*080d01c4SChristoph Hellwig 	}
502*080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_used_buckets_lock);
503*080d01c4SChristoph Hellwig 
504*080d01c4SChristoph Hellwig 	if (!victim_rtg)
505*080d01c4SChristoph Hellwig 		return false;
506*080d01c4SChristoph Hellwig 
507*080d01c4SChristoph Hellwig 	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
508*080d01c4SChristoph Hellwig 	xfs_zone_gc_iter_init(iter, victim_rtg);
509*080d01c4SChristoph Hellwig 	return true;
510*080d01c4SChristoph Hellwig }
511*080d01c4SChristoph Hellwig 
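/*
 * Take the open zone with the lowest write pointer (i.e. the most space left
 * to write into) away from regular data placement and dedicate it to GC.
 */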
512*080d01c4SChristoph Hellwig static struct xfs_open_zone *
513*080d01c4SChristoph Hellwig xfs_zone_gc_steal_open(
514*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi)
515*080d01c4SChristoph Hellwig {
516*080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz, *found = NULL;
517*080d01c4SChristoph Hellwig 
518*080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_open_zones_lock);
519*080d01c4SChristoph Hellwig 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
520*080d01c4SChristoph Hellwig 		if (!found ||
521*080d01c4SChristoph Hellwig 		    oz->oz_write_pointer < found->oz_write_pointer)
522*080d01c4SChristoph Hellwig 			found = oz;
523*080d01c4SChristoph Hellwig 	}
524*080d01c4SChristoph Hellwig 
525*080d01c4SChristoph Hellwig 	if (found) {
526*080d01c4SChristoph Hellwig 		found->oz_is_gc = true;
527*080d01c4SChristoph Hellwig 		list_del_init(&found->oz_entry);
528*080d01c4SChristoph Hellwig 		zi->zi_nr_open_zones--;
529*080d01c4SChristoph Hellwig 	}
530*080d01c4SChristoph Hellwig 
531*080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_open_zones_lock);
532*080d01c4SChristoph Hellwig 	return found;
533*080d01c4SChristoph Hellwig }
534*080d01c4SChristoph Hellwig 
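/*
 * Open a new GC target zone once all writes to the previous one have
 * completed, and record it in the zone info structure.
 */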
535*080d01c4SChristoph Hellwig static struct xfs_open_zone *
536*080d01c4SChristoph Hellwig xfs_zone_gc_select_target(
537*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
538*080d01c4SChristoph Hellwig {
539*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
540*080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;
541*080d01c4SChristoph Hellwig 
542*080d01c4SChristoph Hellwig 	/*
543*080d01c4SChristoph Hellwig 	 * We need to wait for pending writes to finish.
544*080d01c4SChristoph Hellwig 	 */
545*080d01c4SChristoph Hellwig 	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
546*080d01c4SChristoph Hellwig 		return NULL;
547*080d01c4SChristoph Hellwig 
548*080d01c4SChristoph Hellwig 	ASSERT(zi->zi_nr_open_zones <=
549*080d01c4SChristoph Hellwig 		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
550*080d01c4SChristoph Hellwig 	oz = xfs_open_zone(mp, true);
551*080d01c4SChristoph Hellwig 	if (oz)
552*080d01c4SChristoph Hellwig 		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
553*080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_open_zones_lock);
554*080d01c4SChristoph Hellwig 	zi->zi_open_gc_zone = oz;
555*080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_open_zones_lock);
556*080d01c4SChristoph Hellwig 	return oz;
557*080d01c4SChristoph Hellwig }
558*080d01c4SChristoph Hellwig 
559*080d01c4SChristoph Hellwig /*
560*080d01c4SChristoph Hellwig  * Ensure we have a valid open zone to write the GC data to.
561*080d01c4SChristoph Hellwig  *
562*080d01c4SChristoph Hellwig  * If the current target zone has space, keep writing to it, else first wait for
563*080d01c4SChristoph Hellwig  * all pending writes and then pick a new one.
564*080d01c4SChristoph Hellwig  */
565*080d01c4SChristoph Hellwig static struct xfs_open_zone *
566*080d01c4SChristoph Hellwig xfs_zone_gc_ensure_target(
567*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
568*080d01c4SChristoph Hellwig {
569*080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;
570*080d01c4SChristoph Hellwig 
571*080d01c4SChristoph Hellwig 	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
572*080d01c4SChristoph Hellwig 		return xfs_zone_gc_select_target(mp);
573*080d01c4SChristoph Hellwig 	return oz;
574*080d01c4SChristoph Hellwig }
575*080d01c4SChristoph Hellwig 
576*080d01c4SChristoph Hellwig static unsigned int
577*080d01c4SChristoph Hellwig xfs_zone_gc_scratch_available(
578*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
579*080d01c4SChristoph Hellwig {
580*080d01c4SChristoph Hellwig 	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
581*080d01c4SChristoph Hellwig }
582*080d01c4SChristoph Hellwig 
583*080d01c4SChristoph Hellwig static bool
584*080d01c4SChristoph Hellwig xfs_zone_gc_space_available(
585*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
586*080d01c4SChristoph Hellwig {
587*080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
588*080d01c4SChristoph Hellwig 
589*080d01c4SChristoph Hellwig 	oz = xfs_zone_gc_ensure_target(data->mp);
590*080d01c4SChristoph Hellwig 	if (!oz)
591*080d01c4SChristoph Hellwig 		return false;
592*080d01c4SChristoph Hellwig 	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
593*080d01c4SChristoph Hellwig 		xfs_zone_gc_scratch_available(data);
594*080d01c4SChristoph Hellwig }
595*080d01c4SChristoph Hellwig 
596*080d01c4SChristoph Hellwig static void
597*080d01c4SChristoph Hellwig xfs_zone_gc_end_io(
598*080d01c4SChristoph Hellwig 	struct bio		*bio)
599*080d01c4SChristoph Hellwig {
600*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk =
601*080d01c4SChristoph Hellwig 		container_of(bio, struct xfs_gc_bio, bio);
602*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data = chunk->data;
603*080d01c4SChristoph Hellwig 
604*080d01c4SChristoph Hellwig 	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
605*080d01c4SChristoph Hellwig 	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
606*080d01c4SChristoph Hellwig }
607*080d01c4SChristoph Hellwig 
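/*
 * Allocate up to *count_fsb blocks at the write pointer of the GC target
 * zone, charged against the reserved free space pools.  Returns the open
 * zone with a reference held and fills in *daddr and *is_seq, or NULL if no
 * space could be allocated.
 */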
608*080d01c4SChristoph Hellwig static struct xfs_open_zone *
609*080d01c4SChristoph Hellwig xfs_zone_gc_alloc_blocks(
610*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
611*080d01c4SChristoph Hellwig 	xfs_extlen_t		*count_fsb,
612*080d01c4SChristoph Hellwig 	xfs_daddr_t		*daddr,
613*080d01c4SChristoph Hellwig 	bool			*is_seq)
614*080d01c4SChristoph Hellwig {
615*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
616*080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
617*080d01c4SChristoph Hellwig 
618*080d01c4SChristoph Hellwig 	oz = xfs_zone_gc_ensure_target(mp);
619*080d01c4SChristoph Hellwig 	if (!oz)
620*080d01c4SChristoph Hellwig 		return NULL;
621*080d01c4SChristoph Hellwig 
622*080d01c4SChristoph Hellwig 	*count_fsb = min(*count_fsb,
623*080d01c4SChristoph Hellwig 		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
624*080d01c4SChristoph Hellwig 
625*080d01c4SChristoph Hellwig 	/*
626*080d01c4SChristoph Hellwig 	 * Directly allocate GC blocks from the reserved pool.
627*080d01c4SChristoph Hellwig 	 *
628*080d01c4SChristoph Hellwig 	 * If we'd take them from the normal pool we could be stealing blocks
629*080d01c4SChristoph Hellwig 	 * from a regular writer, which would then have to wait for GC and
630*080d01c4SChristoph Hellwig 	 * deadlock.
631*080d01c4SChristoph Hellwig 	 */
632*080d01c4SChristoph Hellwig 	spin_lock(&mp->m_sb_lock);
633*080d01c4SChristoph Hellwig 	*count_fsb = min(*count_fsb,
634*080d01c4SChristoph Hellwig 			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
635*080d01c4SChristoph Hellwig 	*count_fsb = min3(*count_fsb,
636*080d01c4SChristoph Hellwig 			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
637*080d01c4SChristoph Hellwig 			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
638*080d01c4SChristoph Hellwig 	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
639*080d01c4SChristoph Hellwig 	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
640*080d01c4SChristoph Hellwig 	spin_unlock(&mp->m_sb_lock);
641*080d01c4SChristoph Hellwig 
642*080d01c4SChristoph Hellwig 	if (!*count_fsb)
643*080d01c4SChristoph Hellwig 		return NULL;
644*080d01c4SChristoph Hellwig 
645*080d01c4SChristoph Hellwig 	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
646*080d01c4SChristoph Hellwig 	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
647*080d01c4SChristoph Hellwig 	if (!*is_seq)
648*080d01c4SChristoph Hellwig 		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
649*080d01c4SChristoph Hellwig 	oz->oz_write_pointer += *count_fsb;
650*080d01c4SChristoph Hellwig 	atomic_inc(&oz->oz_ref);
651*080d01c4SChristoph Hellwig 	return oz;
652*080d01c4SChristoph Hellwig }
653*080d01c4SChristoph Hellwig 
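/*
 * Start the next GC chunk: allocate space in the target zone and submit a
 * read that pulls the live data from the victim zone into the scratch folio.
 */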
654*080d01c4SChristoph Hellwig static bool
655*080d01c4SChristoph Hellwig xfs_zone_gc_start_chunk(
656*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
657*080d01c4SChristoph Hellwig {
658*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_iter	*iter = &data->iter;
659*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
660*080d01c4SChristoph Hellwig 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
661*080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
662*080d01c4SChristoph Hellwig 	struct xfs_rmap_irec	irec;
663*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk;
664*080d01c4SChristoph Hellwig 	struct xfs_inode	*ip;
665*080d01c4SChristoph Hellwig 	struct bio		*bio;
666*080d01c4SChristoph Hellwig 	xfs_daddr_t		daddr;
667*080d01c4SChristoph Hellwig 	bool			is_seq;
668*080d01c4SChristoph Hellwig 
669*080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp))
670*080d01c4SChristoph Hellwig 		return false;
671*080d01c4SChristoph Hellwig 
672*080d01c4SChristoph Hellwig 	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
673*080d01c4SChristoph Hellwig 		return false;
674*080d01c4SChristoph Hellwig 	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
675*080d01c4SChristoph Hellwig 			&is_seq);
676*080d01c4SChristoph Hellwig 	if (!oz) {
677*080d01c4SChristoph Hellwig 		xfs_irele(ip);
678*080d01c4SChristoph Hellwig 		return false;
679*080d01c4SChristoph Hellwig 	}
680*080d01c4SChristoph Hellwig 
681*080d01c4SChristoph Hellwig 	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
682*080d01c4SChristoph Hellwig 
683*080d01c4SChristoph Hellwig 	chunk = container_of(bio, struct xfs_gc_bio, bio);
684*080d01c4SChristoph Hellwig 	chunk->ip = ip;
685*080d01c4SChristoph Hellwig 	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
686*080d01c4SChristoph Hellwig 	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
687*080d01c4SChristoph Hellwig 	chunk->old_startblock =
688*080d01c4SChristoph Hellwig 		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
689*080d01c4SChristoph Hellwig 	chunk->new_daddr = daddr;
690*080d01c4SChristoph Hellwig 	chunk->is_seq = is_seq;
691*080d01c4SChristoph Hellwig 	chunk->scratch = &data->scratch[data->scratch_idx];
692*080d01c4SChristoph Hellwig 	chunk->data = data;
693*080d01c4SChristoph Hellwig 	chunk->oz = oz;
694*080d01c4SChristoph Hellwig 
695*080d01c4SChristoph Hellwig 	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
696*080d01c4SChristoph Hellwig 	bio->bi_end_io = xfs_zone_gc_end_io;
697*080d01c4SChristoph Hellwig 	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
698*080d01c4SChristoph Hellwig 			chunk->scratch->offset);
699*080d01c4SChristoph Hellwig 	chunk->scratch->offset += chunk->len;
700*080d01c4SChristoph Hellwig 	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
701*080d01c4SChristoph Hellwig 		data->scratch_idx =
702*080d01c4SChristoph Hellwig 			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
703*080d01c4SChristoph Hellwig 	}
704*080d01c4SChristoph Hellwig 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
705*080d01c4SChristoph Hellwig 	list_add_tail(&chunk->entry, &data->reading);
706*080d01c4SChristoph Hellwig 	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
707*080d01c4SChristoph Hellwig 
708*080d01c4SChristoph Hellwig 	submit_bio(bio);
709*080d01c4SChristoph Hellwig 	return true;
710*080d01c4SChristoph Hellwig }
711*080d01c4SChristoph Hellwig 
712*080d01c4SChristoph Hellwig static void
713*080d01c4SChristoph Hellwig xfs_zone_gc_free_chunk(
714*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
715*080d01c4SChristoph Hellwig {
716*080d01c4SChristoph Hellwig 	list_del(&chunk->entry);
717*080d01c4SChristoph Hellwig 	xfs_open_zone_put(chunk->oz);
718*080d01c4SChristoph Hellwig 	xfs_irele(chunk->ip);
719*080d01c4SChristoph Hellwig 	bio_put(&chunk->bio);
720*080d01c4SChristoph Hellwig }
721*080d01c4SChristoph Hellwig 
722*080d01c4SChristoph Hellwig static void
723*080d01c4SChristoph Hellwig xfs_zone_gc_submit_write(
724*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
725*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
726*080d01c4SChristoph Hellwig {
727*080d01c4SChristoph Hellwig 	if (chunk->is_seq) {
728*080d01c4SChristoph Hellwig 		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
729*080d01c4SChristoph Hellwig 		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
730*080d01c4SChristoph Hellwig 	}
731*080d01c4SChristoph Hellwig 	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
732*080d01c4SChristoph Hellwig 	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
733*080d01c4SChristoph Hellwig 	submit_bio(&chunk->bio);
734*080d01c4SChristoph Hellwig }
735*080d01c4SChristoph Hellwig 
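/*
 * Split off the front of a GC write that exceeds the zone append limit.  The
 * split chunk shares the inode, scratch folio and open zone of the original
 * chunk (taking extra references where needed) and is queued right before it.
 */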
736*080d01c4SChristoph Hellwig static struct xfs_gc_bio *
737*080d01c4SChristoph Hellwig xfs_zone_gc_split_write(
738*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
739*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
740*080d01c4SChristoph Hellwig {
741*080d01c4SChristoph Hellwig 	struct queue_limits	*lim =
742*080d01c4SChristoph Hellwig 		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
743*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*split_chunk;
744*080d01c4SChristoph Hellwig 	int			split_sectors;
745*080d01c4SChristoph Hellwig 	unsigned int		split_len;
746*080d01c4SChristoph Hellwig 	struct bio		*split;
747*080d01c4SChristoph Hellwig 	unsigned int		nsegs;
748*080d01c4SChristoph Hellwig 
749*080d01c4SChristoph Hellwig 	if (!chunk->is_seq)
750*080d01c4SChristoph Hellwig 		return NULL;
751*080d01c4SChristoph Hellwig 
752*080d01c4SChristoph Hellwig 	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
753*080d01c4SChristoph Hellwig 			lim->max_zone_append_sectors << SECTOR_SHIFT);
754*080d01c4SChristoph Hellwig 	if (!split_sectors)
755*080d01c4SChristoph Hellwig 		return NULL;
756*080d01c4SChristoph Hellwig 
757*080d01c4SChristoph Hellwig 	/* ensure the split chunk is still block size aligned */
758*080d01c4SChristoph Hellwig 	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
759*080d01c4SChristoph Hellwig 			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
760*080d01c4SChristoph Hellwig 	split_len = split_sectors << SECTOR_SHIFT;
761*080d01c4SChristoph Hellwig 
762*080d01c4SChristoph Hellwig 	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
763*080d01c4SChristoph Hellwig 	split_chunk = container_of(split, struct xfs_gc_bio, bio);
764*080d01c4SChristoph Hellwig 	split_chunk->data = data;
765*080d01c4SChristoph Hellwig 	ihold(VFS_I(chunk->ip));
766*080d01c4SChristoph Hellwig 	split_chunk->ip = chunk->ip;
767*080d01c4SChristoph Hellwig 	split_chunk->is_seq = chunk->is_seq;
768*080d01c4SChristoph Hellwig 	split_chunk->scratch = chunk->scratch;
769*080d01c4SChristoph Hellwig 	split_chunk->offset = chunk->offset;
770*080d01c4SChristoph Hellwig 	split_chunk->len = split_len;
771*080d01c4SChristoph Hellwig 	split_chunk->old_startblock = chunk->old_startblock;
772*080d01c4SChristoph Hellwig 	split_chunk->new_daddr = chunk->new_daddr;
773*080d01c4SChristoph Hellwig 	split_chunk->oz = chunk->oz;
774*080d01c4SChristoph Hellwig 	atomic_inc(&chunk->oz->oz_ref);
775*080d01c4SChristoph Hellwig 
776*080d01c4SChristoph Hellwig 	chunk->offset += split_len;
777*080d01c4SChristoph Hellwig 	chunk->len -= split_len;
778*080d01c4SChristoph Hellwig 	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
779*080d01c4SChristoph Hellwig 
780*080d01c4SChristoph Hellwig 	/* add right before the original chunk */
781*080d01c4SChristoph Hellwig 	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
782*080d01c4SChristoph Hellwig 	list_add_tail(&split_chunk->entry, &chunk->entry);
783*080d01c4SChristoph Hellwig 	return split_chunk;
784*080d01c4SChristoph Hellwig }
785*080d01c4SChristoph Hellwig 
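/*
 * A GC read completed: reuse the bio to write the data out to its new
 * location in the target zone, splitting the I/O if it exceeds the device
 * limits.
 */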
786*080d01c4SChristoph Hellwig static void
787*080d01c4SChristoph Hellwig xfs_zone_gc_write_chunk(
788*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
789*080d01c4SChristoph Hellwig {
790*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data = chunk->data;
791*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = chunk->ip->i_mount;
792*080d01c4SChristoph Hellwig 	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
793*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*split_chunk;
794*080d01c4SChristoph Hellwig 
795*080d01c4SChristoph Hellwig 	if (chunk->bio.bi_status)
796*080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
797*080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp)) {
798*080d01c4SChristoph Hellwig 		xfs_zone_gc_free_chunk(chunk);
799*080d01c4SChristoph Hellwig 		return;
800*080d01c4SChristoph Hellwig 	}
801*080d01c4SChristoph Hellwig 
802*080d01c4SChristoph Hellwig 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
803*080d01c4SChristoph Hellwig 	list_move_tail(&chunk->entry, &data->writing);
804*080d01c4SChristoph Hellwig 
805*080d01c4SChristoph Hellwig 	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
806*080d01c4SChristoph Hellwig 	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
807*080d01c4SChristoph Hellwig 			folio_offset);
808*080d01c4SChristoph Hellwig 
809*080d01c4SChristoph Hellwig 	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
810*080d01c4SChristoph Hellwig 		xfs_zone_gc_submit_write(data, split_chunk);
811*080d01c4SChristoph Hellwig 	xfs_zone_gc_submit_write(data, chunk);
812*080d01c4SChristoph Hellwig }
813*080d01c4SChristoph Hellwig 
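/*
 * A GC write completed: release the scratch space and remap the file range
 * to the new location, unless another write raced with us (which
 * xfs_zoned_end_io() detects via the old startblock).
 */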
814*080d01c4SChristoph Hellwig static void
815*080d01c4SChristoph Hellwig xfs_zone_gc_finish_chunk(
816*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
817*080d01c4SChristoph Hellwig {
818*080d01c4SChristoph Hellwig 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
819*080d01c4SChristoph Hellwig 	struct xfs_inode	*ip = chunk->ip;
820*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = ip->i_mount;
821*080d01c4SChristoph Hellwig 	int			error;
822*080d01c4SChristoph Hellwig 
823*080d01c4SChristoph Hellwig 	if (chunk->bio.bi_status)
824*080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
825*080d01c4SChristoph Hellwig 	if (xfs_is_shutdown(mp)) {
826*080d01c4SChristoph Hellwig 		xfs_zone_gc_free_chunk(chunk);
827*080d01c4SChristoph Hellwig 		return;
828*080d01c4SChristoph Hellwig 	}
829*080d01c4SChristoph Hellwig 
830*080d01c4SChristoph Hellwig 	chunk->scratch->freed += chunk->len;
831*080d01c4SChristoph Hellwig 	if (chunk->scratch->freed == chunk->scratch->offset) {
832*080d01c4SChristoph Hellwig 		chunk->scratch->offset = 0;
833*080d01c4SChristoph Hellwig 		chunk->scratch->freed = 0;
834*080d01c4SChristoph Hellwig 	}
835*080d01c4SChristoph Hellwig 
836*080d01c4SChristoph Hellwig 	/*
837*080d01c4SChristoph Hellwig 	 * Cycle through the iolock and wait for direct I/O and layouts to
838*080d01c4SChristoph Hellwig 	 * ensure no one is reading from the old mapping before it goes away.
839*080d01c4SChristoph Hellwig 	 *
840*080d01c4SChristoph Hellwig 	 * Note that xfs_zoned_end_io() below checks that no other writer raced
841*080d01c4SChristoph Hellwig 	 * with us to update the mapping by checking that the old startblock
842*080d01c4SChristoph Hellwig 	 * didn't change.
843*080d01c4SChristoph Hellwig 	 */
844*080d01c4SChristoph Hellwig 	xfs_ilock(ip, iolock);
845*080d01c4SChristoph Hellwig 	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
846*080d01c4SChristoph Hellwig 	if (!error)
847*080d01c4SChristoph Hellwig 		inode_dio_wait(VFS_I(ip));
848*080d01c4SChristoph Hellwig 	xfs_iunlock(ip, iolock);
849*080d01c4SChristoph Hellwig 	if (error)
850*080d01c4SChristoph Hellwig 		goto free;
851*080d01c4SChristoph Hellwig 
852*080d01c4SChristoph Hellwig 	if (chunk->is_seq)
853*080d01c4SChristoph Hellwig 		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
854*080d01c4SChristoph Hellwig 	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
855*080d01c4SChristoph Hellwig 			chunk->new_daddr, chunk->oz, chunk->old_startblock);
856*080d01c4SChristoph Hellwig free:
857*080d01c4SChristoph Hellwig 	if (error)
858*080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
859*080d01c4SChristoph Hellwig 	xfs_zone_gc_free_chunk(chunk);
860*080d01c4SChristoph Hellwig }
861*080d01c4SChristoph Hellwig 
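/*
 * A zone reset (or discard) completed: mark the zone free, account its
 * blocks as available again and wake up anyone waiting for free zones.
 */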
862*080d01c4SChristoph Hellwig static void
863*080d01c4SChristoph Hellwig xfs_zone_gc_finish_reset(
864*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk)
865*080d01c4SChristoph Hellwig {
866*080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
867*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = rtg_mount(rtg);
868*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
869*080d01c4SChristoph Hellwig 
870*080d01c4SChristoph Hellwig 	if (chunk->bio.bi_status) {
871*080d01c4SChristoph Hellwig 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
872*080d01c4SChristoph Hellwig 		goto out;
873*080d01c4SChristoph Hellwig 	}
874*080d01c4SChristoph Hellwig 
875*080d01c4SChristoph Hellwig 	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
876*080d01c4SChristoph Hellwig 	atomic_inc(&zi->zi_nr_free_zones);
877*080d01c4SChristoph Hellwig 
878*080d01c4SChristoph Hellwig 	xfs_zoned_add_available(mp, rtg_blocks(rtg));
879*080d01c4SChristoph Hellwig 
880*080d01c4SChristoph Hellwig 	wake_up_all(&zi->zi_zone_wait);
881*080d01c4SChristoph Hellwig out:
882*080d01c4SChristoph Hellwig 	list_del(&chunk->entry);
883*080d01c4SChristoph Hellwig 	bio_put(&chunk->bio);
884*080d01c4SChristoph Hellwig }
885*080d01c4SChristoph Hellwig 
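/*
 * Prepare a reset bio for the given zone: a real zone reset for sequential
 * write required zones, a discard for conventional zones, or failure if the
 * device supports neither.
 */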
886*080d01c4SChristoph Hellwig static bool
887*080d01c4SChristoph Hellwig xfs_zone_gc_prepare_reset(
888*080d01c4SChristoph Hellwig 	struct bio		*bio,
889*080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg)
890*080d01c4SChristoph Hellwig {
891*080d01c4SChristoph Hellwig 	trace_xfs_zone_reset(rtg);
892*080d01c4SChristoph Hellwig 
893*080d01c4SChristoph Hellwig 	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
894*080d01c4SChristoph Hellwig 	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
895*080d01c4SChristoph Hellwig 	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
896*080d01c4SChristoph Hellwig 		if (!bdev_max_discard_sectors(bio->bi_bdev))
897*080d01c4SChristoph Hellwig 			return false;
898*080d01c4SChristoph Hellwig 		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
899*080d01c4SChristoph Hellwig 		bio->bi_iter.bi_size =
900*080d01c4SChristoph Hellwig 			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
901*080d01c4SChristoph Hellwig 	}
902*080d01c4SChristoph Hellwig 
903*080d01c4SChristoph Hellwig 	return true;
904*080d01c4SChristoph Hellwig }
905*080d01c4SChristoph Hellwig 
906*080d01c4SChristoph Hellwig int
907*080d01c4SChristoph Hellwig xfs_zone_gc_reset_sync(
908*080d01c4SChristoph Hellwig 	struct xfs_rtgroup	*rtg)
909*080d01c4SChristoph Hellwig {
910*080d01c4SChristoph Hellwig 	int			error = 0;
911*080d01c4SChristoph Hellwig 	struct bio		bio;
912*080d01c4SChristoph Hellwig 
913*080d01c4SChristoph Hellwig 	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
914*080d01c4SChristoph Hellwig 			REQ_OP_ZONE_RESET);
915*080d01c4SChristoph Hellwig 	if (xfs_zone_gc_prepare_reset(&bio, rtg))
916*080d01c4SChristoph Hellwig 		error = submit_bio_wait(&bio);
917*080d01c4SChristoph Hellwig 	bio_uninit(&bio);
918*080d01c4SChristoph Hellwig 
919*080d01c4SChristoph Hellwig 	return error;
920*080d01c4SChristoph Hellwig }
921*080d01c4SChristoph Hellwig 
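/*
 * Reset all zones on the passed in list.  The RT device cache is flushed and
 * the rmap inode is forced to the log first so that no transaction still
 * references the about to be reset blocks.
 */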
922*080d01c4SChristoph Hellwig static void
923*080d01c4SChristoph Hellwig xfs_zone_gc_reset_zones(
924*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data,
925*080d01c4SChristoph Hellwig 	struct xfs_group	*reset_list)
926*080d01c4SChristoph Hellwig {
927*080d01c4SChristoph Hellwig 	struct xfs_group	*next = reset_list;
928*080d01c4SChristoph Hellwig 
929*080d01c4SChristoph Hellwig 	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
930*080d01c4SChristoph Hellwig 		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
931*080d01c4SChristoph Hellwig 		return;
932*080d01c4SChristoph Hellwig 	}
933*080d01c4SChristoph Hellwig 
934*080d01c4SChristoph Hellwig 	do {
935*080d01c4SChristoph Hellwig 		struct xfs_rtgroup	*rtg = to_rtg(next);
936*080d01c4SChristoph Hellwig 		struct xfs_gc_bio	*chunk;
937*080d01c4SChristoph Hellwig 		struct bio		*bio;
938*080d01c4SChristoph Hellwig 
939*080d01c4SChristoph Hellwig 		xfs_log_force_inode(rtg_rmap(rtg));
940*080d01c4SChristoph Hellwig 
941*080d01c4SChristoph Hellwig 		next = rtg_group(rtg)->xg_next_reset;
942*080d01c4SChristoph Hellwig 		rtg_group(rtg)->xg_next_reset = NULL;
943*080d01c4SChristoph Hellwig 
944*080d01c4SChristoph Hellwig 		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
945*080d01c4SChristoph Hellwig 				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
946*080d01c4SChristoph Hellwig 		bio->bi_private = rtg;
947*080d01c4SChristoph Hellwig 		bio->bi_end_io = xfs_zone_gc_end_io;
948*080d01c4SChristoph Hellwig 
949*080d01c4SChristoph Hellwig 		chunk = container_of(bio, struct xfs_gc_bio, bio);
950*080d01c4SChristoph Hellwig 		chunk->data = data;
951*080d01c4SChristoph Hellwig 		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
952*080d01c4SChristoph Hellwig 		list_add_tail(&chunk->entry, &data->resetting);
953*080d01c4SChristoph Hellwig 
954*080d01c4SChristoph Hellwig 		/*
955*080d01c4SChristoph Hellwig 		 * Also use the bio to drive the state machine when neither
956*080d01c4SChristoph Hellwig 		 * zone reset nor discard is supported to keep things simple.
957*080d01c4SChristoph Hellwig 		 */
958*080d01c4SChristoph Hellwig 		if (xfs_zone_gc_prepare_reset(bio, rtg))
959*080d01c4SChristoph Hellwig 			submit_bio(bio);
960*080d01c4SChristoph Hellwig 		else
961*080d01c4SChristoph Hellwig 			bio_endio(bio);
962*080d01c4SChristoph Hellwig 	} while (next);
963*080d01c4SChristoph Hellwig }
964*080d01c4SChristoph Hellwig 
965*080d01c4SChristoph Hellwig /*
966*080d01c4SChristoph Hellwig  * Handle the work to read and write data for GC and to reset the zones,
967*080d01c4SChristoph Hellwig  * including handling all completions.
968*080d01c4SChristoph Hellwig  *
969*080d01c4SChristoph Hellwig  * Note that the order of the chunks is preserved so that we don't undo the
970*080d01c4SChristoph Hellwig  * optimal order established by xfs_zone_gc_query().
971*080d01c4SChristoph Hellwig  */
972*080d01c4SChristoph Hellwig static bool
973*080d01c4SChristoph Hellwig xfs_zone_gc_handle_work(
974*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data)
975*080d01c4SChristoph Hellwig {
976*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
977*080d01c4SChristoph Hellwig 	struct xfs_gc_bio	*chunk, *next;
978*080d01c4SChristoph Hellwig 	struct xfs_group	*reset_list;
979*080d01c4SChristoph Hellwig 	struct blk_plug		plug;
980*080d01c4SChristoph Hellwig 
981*080d01c4SChristoph Hellwig 	spin_lock(&zi->zi_reset_list_lock);
982*080d01c4SChristoph Hellwig 	reset_list = zi->zi_reset_list;
983*080d01c4SChristoph Hellwig 	zi->zi_reset_list = NULL;
984*080d01c4SChristoph Hellwig 	spin_unlock(&zi->zi_reset_list_lock);
985*080d01c4SChristoph Hellwig 
986*080d01c4SChristoph Hellwig 	if (!xfs_zone_gc_select_victim(data) ||
987*080d01c4SChristoph Hellwig 	    !xfs_zone_gc_space_available(data)) {
988*080d01c4SChristoph Hellwig 		if (list_empty(&data->reading) &&
989*080d01c4SChristoph Hellwig 		    list_empty(&data->writing) &&
990*080d01c4SChristoph Hellwig 		    list_empty(&data->resetting) &&
991*080d01c4SChristoph Hellwig 		    !reset_list)
992*080d01c4SChristoph Hellwig 			return false;
993*080d01c4SChristoph Hellwig 	}
994*080d01c4SChristoph Hellwig 
995*080d01c4SChristoph Hellwig 	__set_current_state(TASK_RUNNING);
996*080d01c4SChristoph Hellwig 	try_to_freeze();
997*080d01c4SChristoph Hellwig 
998*080d01c4SChristoph Hellwig 	if (reset_list)
999*080d01c4SChristoph Hellwig 		xfs_zone_gc_reset_zones(data, reset_list);
1000*080d01c4SChristoph Hellwig 
1001*080d01c4SChristoph Hellwig 	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1002*080d01c4SChristoph Hellwig 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1003*080d01c4SChristoph Hellwig 			break;
1004*080d01c4SChristoph Hellwig 		xfs_zone_gc_finish_reset(chunk);
1005*080d01c4SChristoph Hellwig 	}
1006*080d01c4SChristoph Hellwig 
1007*080d01c4SChristoph Hellwig 	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1008*080d01c4SChristoph Hellwig 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1009*080d01c4SChristoph Hellwig 			break;
1010*080d01c4SChristoph Hellwig 		xfs_zone_gc_finish_chunk(chunk);
1011*080d01c4SChristoph Hellwig 	}
1012*080d01c4SChristoph Hellwig 
1013*080d01c4SChristoph Hellwig 	blk_start_plug(&plug);
1014*080d01c4SChristoph Hellwig 	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1015*080d01c4SChristoph Hellwig 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1016*080d01c4SChristoph Hellwig 			break;
1017*080d01c4SChristoph Hellwig 		xfs_zone_gc_write_chunk(chunk);
1018*080d01c4SChristoph Hellwig 	}
1019*080d01c4SChristoph Hellwig 	blk_finish_plug(&plug);
1020*080d01c4SChristoph Hellwig 
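	/* Kick off new GC chunks until no more work can be started. */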
1021*080d01c4SChristoph Hellwig 	blk_start_plug(&plug);
1022*080d01c4SChristoph Hellwig 	while (xfs_zone_gc_start_chunk(data))
1023*080d01c4SChristoph Hellwig 		;
1024*080d01c4SChristoph Hellwig 	blk_finish_plug(&plug);
1025*080d01c4SChristoph Hellwig 	return true;
1026*080d01c4SChristoph Hellwig }
1027*080d01c4SChristoph Hellwig 
1028*080d01c4SChristoph Hellwig /*
1029*080d01c4SChristoph Hellwig  * Note that the current GC algorithm would break reflinks and thus duplicate
1030*080d01c4SChristoph Hellwig  * data that was previously shared by multiple owners.  Because of that,
1031*080d01c4SChristoph Hellwig  * reflinks are currently not supported on zoned file systems: they cannot be
1032*080d01c4SChristoph Hellwig  * created, and file systems containing them cannot be mounted.
1033*080d01c4SChristoph Hellwig  */
1034*080d01c4SChristoph Hellwig static int
1035*080d01c4SChristoph Hellwig xfs_zoned_gcd(
1036*080d01c4SChristoph Hellwig 	void			*private)
1037*080d01c4SChristoph Hellwig {
1038*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data = private;
1039*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp = data->mp;
1040*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
1041*080d01c4SChristoph Hellwig 	unsigned int		nofs_flag;
1042*080d01c4SChristoph Hellwig 
1043*080d01c4SChristoph Hellwig 	nofs_flag = memalloc_nofs_save();
1044*080d01c4SChristoph Hellwig 	set_freezable();
1045*080d01c4SChristoph Hellwig 
1046*080d01c4SChristoph Hellwig 	for (;;) {
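		/*
		 * Set the task state before looking for work so that a wakeup
		 * arriving after the checks below cannot be missed; the work
		 * handler switches back to TASK_RUNNING as soon as it decides
		 * there is something to do.
		 */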
1047*080d01c4SChristoph Hellwig 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1048*080d01c4SChristoph Hellwig 		xfs_set_zonegc_running(mp);
1049*080d01c4SChristoph Hellwig 		if (xfs_zone_gc_handle_work(data))
1050*080d01c4SChristoph Hellwig 			continue;
1051*080d01c4SChristoph Hellwig 
1052*080d01c4SChristoph Hellwig 		if (list_empty(&data->reading) &&
1053*080d01c4SChristoph Hellwig 		    list_empty(&data->writing) &&
1054*080d01c4SChristoph Hellwig 		    list_empty(&data->resetting) &&
1055*080d01c4SChristoph Hellwig 		    !zi->zi_reset_list) {
1056*080d01c4SChristoph Hellwig 			xfs_clear_zonegc_running(mp);
1057*080d01c4SChristoph Hellwig 			xfs_zoned_resv_wake_all(mp);
1058*080d01c4SChristoph Hellwig 
1059*080d01c4SChristoph Hellwig 			if (kthread_should_stop()) {
1060*080d01c4SChristoph Hellwig 				__set_current_state(TASK_RUNNING);
1061*080d01c4SChristoph Hellwig 				break;
1062*080d01c4SChristoph Hellwig 			}
1063*080d01c4SChristoph Hellwig 
1064*080d01c4SChristoph Hellwig 			if (kthread_should_park()) {
1065*080d01c4SChristoph Hellwig 				__set_current_state(TASK_RUNNING);
1066*080d01c4SChristoph Hellwig 				kthread_parkme();
1067*080d01c4SChristoph Hellwig 				continue;
1068*080d01c4SChristoph Hellwig 			}
1069*080d01c4SChristoph Hellwig 		}
1070*080d01c4SChristoph Hellwig 
1071*080d01c4SChristoph Hellwig 		schedule();
1072*080d01c4SChristoph Hellwig 	}
1073*080d01c4SChristoph Hellwig 	xfs_clear_zonegc_running(mp);
1074*080d01c4SChristoph Hellwig 
1075*080d01c4SChristoph Hellwig 	if (data->iter.victim_rtg)
1076*080d01c4SChristoph Hellwig 		xfs_rtgroup_rele(data->iter.victim_rtg);
1077*080d01c4SChristoph Hellwig 
1078*080d01c4SChristoph Hellwig 	memalloc_nofs_restore(nofs_flag);
1079*080d01c4SChristoph Hellwig 	xfs_zone_gc_data_free(data);
1080*080d01c4SChristoph Hellwig 	return 0;
1081*080d01c4SChristoph Hellwig }
1082*080d01c4SChristoph Hellwig 
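/*
 * Unpark the GC thread and let it run.  The thread is created parked by
 * xfs_zone_gc_mount() and only unparked for writable mounts.
 */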
1083*080d01c4SChristoph Hellwig void
1084*080d01c4SChristoph Hellwig xfs_zone_gc_start(
1085*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1086*080d01c4SChristoph Hellwig {
1087*080d01c4SChristoph Hellwig 	if (xfs_has_zoned(mp))
1088*080d01c4SChristoph Hellwig 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1089*080d01c4SChristoph Hellwig }
1090*080d01c4SChristoph Hellwig 
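/*
 * Park the GC thread.  xfs_zoned_gcd() only parks itself once no GC work is
 * in flight, and kthread_park() waits for that, so no GC I/O is pending once
 * this returns.
 */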
1091*080d01c4SChristoph Hellwig void
1092*080d01c4SChristoph Hellwig xfs_zone_gc_stop(
1093*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1094*080d01c4SChristoph Hellwig {
1095*080d01c4SChristoph Hellwig 	if (xfs_has_zoned(mp))
1096*080d01c4SChristoph Hellwig 		kthread_park(mp->m_zone_info->zi_gc_thread);
1097*080d01c4SChristoph Hellwig }
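
/*
 * Illustrative usage sketch (not part of this file): callers pair the two
 * helpers above around write-state transitions.  The handler names below are
 * made up for the example; the real call sites live elsewhere in XFS.
 *
 *	static int example_freeze_zoned(struct xfs_mount *mp)
 *	{
 *		xfs_zone_gc_stop(mp);
 *		return 0;
 *	}
 *
 *	static int example_thaw_zoned(struct xfs_mount *mp)
 *	{
 *		xfs_zone_gc_start(mp);
 *		return 0;
 *	}
 */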
1098*080d01c4SChristoph Hellwig 
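/*
 * Set up GC at mount time: pick an open zone to write GC data into, allocate
 * the GC thread state and create the GC thread.  The thread starts parked and
 * is only unparked by xfs_zone_gc_start() for writable mounts.
 */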
1099*080d01c4SChristoph Hellwig int
1100*080d01c4SChristoph Hellwig xfs_zone_gc_mount(
1101*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1102*080d01c4SChristoph Hellwig {
1103*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
1104*080d01c4SChristoph Hellwig 	struct xfs_zone_gc_data	*data;
1105*080d01c4SChristoph Hellwig 	struct xfs_open_zone	*oz;
1106*080d01c4SChristoph Hellwig 	int			error;
1107*080d01c4SChristoph Hellwig 
1108*080d01c4SChristoph Hellwig 	/*
1109*080d01c4SChristoph Hellwig 	 * If there are no free zones available for GC, pick the open zone with
1110*080d01c4SChristoph Hellwig 	 * the least used space to GC into.  This should only happen after an
1111*080d01c4SChristoph Hellwig 	 * unclean shutdown near ENOSPC while GC was ongoing.
1112*080d01c4SChristoph Hellwig 	 *
1113*080d01c4SChristoph Hellwig 	 * We also need to do this for the first GC zone allocation if we
1114*080d01c4SChristoph Hellwig 	 * unmounted while at the open zone limit.
1115*080d01c4SChristoph Hellwig 	 */
1116*080d01c4SChristoph Hellwig 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1117*080d01c4SChristoph Hellwig 	    zi->zi_nr_open_zones == mp->m_max_open_zones)
1118*080d01c4SChristoph Hellwig 		oz = xfs_zone_gc_steal_open(zi);
1119*080d01c4SChristoph Hellwig 	else
1120*080d01c4SChristoph Hellwig 		oz = xfs_open_zone(mp, true);
1121*080d01c4SChristoph Hellwig 	if (!oz) {
1122*080d01c4SChristoph Hellwig 		xfs_warn(mp, "unable to allocate a zone for gc");
1123*080d01c4SChristoph Hellwig 		error = -EIO;
1124*080d01c4SChristoph Hellwig 		goto out;
1125*080d01c4SChristoph Hellwig 	}
1126*080d01c4SChristoph Hellwig 
1127*080d01c4SChristoph Hellwig 	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1128*080d01c4SChristoph Hellwig 	zi->zi_open_gc_zone = oz;
1129*080d01c4SChristoph Hellwig 
1130*080d01c4SChristoph Hellwig 	data = xfs_zone_gc_data_alloc(mp);
1131*080d01c4SChristoph Hellwig 	if (!data) {
1132*080d01c4SChristoph Hellwig 		error = -ENOMEM;
1133*080d01c4SChristoph Hellwig 		goto out_put_gc_zone;
1134*080d01c4SChristoph Hellwig 	}
1135*080d01c4SChristoph Hellwig 
1136*080d01c4SChristoph Hellwig 	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1137*080d01c4SChristoph Hellwig 			"xfs-zone-gc/%s", mp->m_super->s_id);
1138*080d01c4SChristoph Hellwig 	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
1139*080d01c4SChristoph Hellwig 		xfs_warn(mp, "unable to create zone gc thread");
1140*080d01c4SChristoph Hellwig 		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
1141*080d01c4SChristoph Hellwig 		goto out_free_gc_data;
1142*080d01c4SChristoph Hellwig 	}
1143*080d01c4SChristoph Hellwig 
1144*080d01c4SChristoph Hellwig 	/* xfs_zone_gc_start will unpark for rw mounts */
1145*080d01c4SChristoph Hellwig 	kthread_park(mp->m_zone_info->zi_gc_thread);
1146*080d01c4SChristoph Hellwig 	return 0;
1147*080d01c4SChristoph Hellwig 
1148*080d01c4SChristoph Hellwig out_free_gc_data:
1149*080d01c4SChristoph Hellwig 	kfree(data);
1150*080d01c4SChristoph Hellwig out_put_gc_zone:
1151*080d01c4SChristoph Hellwig 	xfs_open_zone_put(zi->zi_open_gc_zone);
1152*080d01c4SChristoph Hellwig out:
1153*080d01c4SChristoph Hellwig 	return error;
1154*080d01c4SChristoph Hellwig }
1155*080d01c4SChristoph Hellwig 
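/*
 * Tear down GC at unmount: stop the GC thread, which frees its private data
 * on exit, and drop the reference on the open GC zone if one is still held.
 */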
1156*080d01c4SChristoph Hellwig void
1157*080d01c4SChristoph Hellwig xfs_zone_gc_unmount(
1158*080d01c4SChristoph Hellwig 	struct xfs_mount	*mp)
1159*080d01c4SChristoph Hellwig {
1160*080d01c4SChristoph Hellwig 	struct xfs_zone_info	*zi = mp->m_zone_info;
1161*080d01c4SChristoph Hellwig 
1162*080d01c4SChristoph Hellwig 	kthread_stop(zi->zi_gc_thread);
1163*080d01c4SChristoph Hellwig 	if (zi->zi_open_gc_zone)
1164*080d01c4SChristoph Hellwig 		xfs_open_zone_put(zi->zi_open_gc_zone);
1165*080d01c4SChristoph Hellwig }
1166