xref: /linux/fs/xfs/xfs_zone_gc.c (revision 9e4e86a604dfd06402933467578c4b79f5412b2c)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023-2025 Christoph Hellwig.
4  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5  */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_errortag.h"
20 #include "xfs_error.h"
21 #include "xfs_zone_alloc.h"
22 #include "xfs_zone_priv.h"
23 #include "xfs_zones.h"
24 #include "xfs_trace.h"
25 
26 /*
27  * Implement Garbage Collection (GC) of partially used zones.
28  *
29  * To support the purely sequential writes in each zone, zoned XFS needs to be
30  * able to move data remaining in a zone out of it to reset the zone to prepare
31  * for writing to it again.
32  *
33  * This is done by the GC thread implemented in this file.  To support that a
34  * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
35  * write the garbage collected data into.
36  *
37  * Whenever the available space is below the chosen threshold, the GC thread
38  * looks for potential non-empty but not fully used zones that are worth
39  * reclaiming.  Once found the rmap for the victim zone is queried, and after
40  * a bit of sorting to reduce fragmentation, the still live extents are read
41  * into memory and written to the GC target zone, and the bmap btree of the
42  * files is updated to point to the new location.  To avoid taking the IOLOCK
43  * and MMAPLOCK for the entire GC process and thus affecting the latency of
44  * user reads and writes to the files, the GC writes are speculative and the
45  * I/O completion checks that no other writes happened for the affected regions
46  * before remapping.
47  *
48  * Once a zone does not contain any valid data, be that through GC or user
49  * block removal, it is queued for a zone reset.  The reset operation
50  * carefully ensures that the RT device cache is flushed and all transactions
51  * referencing the rmap have been committed to disk.
52  */
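
/*
 * A rough sketch of the flow implemented below (summarizing the comment
 * above, not adding behavior):
 *
 *   select victim zone -> query its rmap -> read the live extents into
 *   the scratch folios -> write them to the GC open zone -> remap the
 *   owning inodes -> reset zones that no longer hold valid data
 *
 * The reading, writing and resetting lists in struct xfs_zone_gc_data
 * track each chunk as it moves through these stages.
 */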
53 
54 /*
55  * Size of each GC scratch allocation, and the number of buffers.
56  */
57 #define XFS_GC_BUF_SIZE		SZ_1M
58 #define XFS_GC_NR_BUFS		2
59 static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
60 
61 /*
62  * Chunk that is read and written for each GC operation.
63  *
64  * Note that for writes to actual zoned devices, the chunk can be split when
65  * reaching the hardware limit.
66  */
67 struct xfs_gc_bio {
68 	struct xfs_zone_gc_data		*data;
69 
70 	/*
71 	 * Entry into the reading/writing/resetting list.  Only accessed from
72 	 * the GC thread, so no locking needed.
73 	 */
74 	struct list_head		entry;
75 
76 	/*
77 	 * State of this gc_bio.  Done means the current I/O completed.
78 	 * Set from the bio end I/O handler, read from the GC thread.
79 	 */
80 	enum {
81 		XFS_GC_BIO_NEW,
82 		XFS_GC_BIO_DONE,
83 	} state;
84 
85 	/*
86 	 * Pointer to the inode and byte range in the inode that this
87 	 * GC chunk is operating on.
88 	 */
89 	struct xfs_inode		*ip;
90 	loff_t				offset;
91 	unsigned int			len;
92 
93 	/*
94 	 * Existing startblock (in the zone to be freed) and newly assigned
95 	 * daddr in the zone GCed into.
96 	 */
97 	xfs_fsblock_t			old_startblock;
98 	xfs_daddr_t			new_daddr;
99 
100 	/* Are we writing to a sequential write required zone? */
101 	bool				is_seq;
102 
103 	/* Open Zone being written to */
104 	struct xfs_open_zone		*oz;
105 
106 	struct xfs_rtgroup		*victim_rtg;
107 
108 	/* Bio used for reads and writes, including the bvec used by it */
109 	struct bio			bio;	/* must be last */
110 };
111 
112 #define XFS_ZONE_GC_RECS		1024
113 
114 /* iterator, needs to be reinitialized for each victim zone */
115 struct xfs_zone_gc_iter {
116 	struct xfs_rtgroup		*victim_rtg;
117 	unsigned int			rec_count;
118 	unsigned int			rec_idx;
119 	xfs_agblock_t			next_startblock;
120 	struct xfs_rmap_irec		*recs;
121 };
122 
123 /*
124  * Per-mount GC state.
125  */
126 struct xfs_zone_gc_data {
127 	struct xfs_mount		*mp;
128 	struct xfs_open_zone		*oz;
129 
130 	/* bioset used to allocate the gc_bios */
131 	struct bio_set			bio_set;
132 
133 	/*
134 	 * Scratchpad to buffer GC data, organized as a ring buffer over
135 	 * discontiguous folios.  scratch_head is where the buffer is filled,
136 	 * scratch_tail tracks the buffer space freed, and scratch_available
137 	 * counts the space available in the ring buffer between the head and
138 	 * the tail.
139 	 */
140 	struct folio			*scratch_folios[XFS_GC_NR_BUFS];
141 	unsigned int			scratch_size;
142 	unsigned int			scratch_available;
143 	unsigned int			scratch_head;
144 	unsigned int			scratch_tail;
145 
146 	/*
147 	 * Lists of bios currently being read, written and reset.
148 	 * These lists are only accessed by the GC thread itself, and must only
149 	 * be processed in order.
150 	 */
151 	struct list_head		reading;
152 	struct list_head		writing;
153 	struct list_head		resetting;
154 
155 	/*
156 	 * Iterator for the victim zone.
157 	 */
158 	struct xfs_zone_gc_iter		iter;
159 };
160 
161 /*
162  * We aim to keep enough free zones in stock to fully use the open zone limit
163  * for data placement purposes. Additionally, the m_zonegc_low_space tunable
164  * can be set to make sure a fraction of the unused blocks are available for
165  * writing.
166  */
167 bool
168 xfs_zoned_need_gc(
169 	struct xfs_mount	*mp)
170 {
171 	s64			available, free, threshold;
172 	s32			remainder;
173 
174 	/* If we have no reclaimable blocks, running GC is useless. */
175 	if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
176 		return false;
177 
178 	/*
179 	 * In order to avoid file fragmentation as much as possible, we should
180 	 * make sure that we can open enough zones. So trigger GC if the number
181 	 * of blocks immediately available for writes is lower than the total
182 	 * number of blocks from all possible open zones.
183 	 */
184 	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
185 	if (available <
186 	    xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
187 		return true;
188 
189 	/*
190 	 * For cases where the user wants to be more aggressive with GC,
191 	 * the sysfs attribute zonegc_low_space may be set to a non-zero value,
192 	 * to indicate that GC should try to maintain at least zonegc_low_space
193 	 * percent of the free space to be directly available for writing. Check
194 	 * this here.
195 	 */
196 	if (!mp->m_zonegc_low_space)
197 		return false;
198 
199 	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
200 	threshold = div_s64_rem(free, 100, &remainder);
201 	threshold = threshold * mp->m_zonegc_low_space +
202 		    remainder * div_s64(mp->m_zonegc_low_space, 100);
203 
204 	return available < threshold;
205 }
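
/*
 * Rough worked example of the threshold math above (illustrative numbers
 * only): with m_zonegc_low_space = 40 and free = 1,000,050 blocks,
 * div_s64_rem() yields 10,000 with a remainder of 50, so
 * threshold = 10,000 * 40 + 50 * (40 / 100) = 400,000 blocks, which
 * approximates free * zonegc_low_space / 100 while keeping the
 * intermediate products small.  For zonegc_low_space = 100 the remainder
 * term makes the threshold exactly equal to free.  GC then keeps running
 * while fewer blocks than the threshold are immediately available.
 */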
206 
207 static struct xfs_zone_gc_data *
208 xfs_zone_gc_data_alloc(
209 	struct xfs_mount	*mp)
210 {
211 	struct xfs_zone_gc_data	*data;
212 	int			i;
213 
214 	data = kzalloc_obj(*data);
215 	if (!data)
216 		return NULL;
217 	data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS);
218 	if (!data->iter.recs)
219 		goto out_free_data;
220 
221 	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
222 			BIOSET_NEED_BVECS))
223 		goto out_free_recs;
224 	for (i = 0; i < XFS_GC_NR_BUFS; i++) {
225 		data->scratch_folios[i] =
226 			folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
227 		if (!data->scratch_folios[i])
228 			goto out_free_scratch;
229 	}
230 	data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
231 	data->scratch_available = data->scratch_size;
232 	INIT_LIST_HEAD(&data->reading);
233 	INIT_LIST_HEAD(&data->writing);
234 	INIT_LIST_HEAD(&data->resetting);
235 	data->mp = mp;
236 	return data;
237 
238 out_free_scratch:
239 	while (--i >= 0)
240 		folio_put(data->scratch_folios[i]);
241 	bioset_exit(&data->bio_set);
242 out_free_recs:
243 	kfree(data->iter.recs);
244 out_free_data:
245 	kfree(data);
246 	return NULL;
247 }
248 
249 static void
250 xfs_zone_gc_data_free(
251 	struct xfs_zone_gc_data	*data)
252 {
253 	int			i;
254 
255 	for (i = 0; i < XFS_GC_NR_BUFS; i++)
256 		folio_put(data->scratch_folios[i]);
257 	bioset_exit(&data->bio_set);
258 	kfree(data->iter.recs);
259 	kfree(data);
260 }
261 
262 static void
263 xfs_zone_gc_iter_init(
264 	struct xfs_zone_gc_iter	*iter,
265 	struct xfs_rtgroup	*victim_rtg)
266 
267 {
268 	iter->next_startblock = 0;
269 	iter->rec_count = 0;
270 	iter->rec_idx = 0;
271 	iter->victim_rtg = victim_rtg;
272 	atomic_inc(&victim_rtg->rtg_gccount);
273 }
274 
275 /*
276  * Query the rmap of the victim zone to gather the records to evacuate.
277  */
278 static int
279 xfs_zone_gc_query_cb(
280 	struct xfs_btree_cur	*cur,
281 	const struct xfs_rmap_irec *irec,
282 	void			*private)
283 {
284 	struct xfs_zone_gc_iter	*iter = private;
285 
286 	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
287 	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
288 	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
289 
290 	iter->recs[iter->rec_count] = *irec;
291 	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
292 		iter->next_startblock =
293 			irec->rm_startblock + irec->rm_blockcount;
294 		return 1;
295 	}
296 	return 0;
297 }
298 
299 static int
300 xfs_zone_gc_rmap_rec_cmp(
301 	const void			*a,
302 	const void			*b)
303 {
304 	const struct xfs_rmap_irec	*reca = a;
305 	const struct xfs_rmap_irec	*recb = b;
306 	int				diff;
307 
308 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
309 	if (diff)
310 		return diff;
311 	return cmp_int(reca->rm_offset, recb->rm_offset);
312 }
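
/*
 * Example of the resulting sort order (hypothetical records): rmap entries
 * for owner 42/offset 4, owner 100/offset 8 and owner 100/offset 0 end up
 * as (42, 4), (100, 0), (100, 8), so each inode's extents are rewritten
 * back to back and in ascending file offset order.
 */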
313 
314 static int
315 xfs_zone_gc_query(
316 	struct xfs_mount	*mp,
317 	struct xfs_zone_gc_iter	*iter)
318 {
319 	struct xfs_rtgroup	*rtg = iter->victim_rtg;
320 	struct xfs_rmap_irec	ri_low = { };
321 	struct xfs_rmap_irec	ri_high;
322 	struct xfs_btree_cur	*cur;
323 	struct xfs_trans	*tp;
324 	int			error;
325 
326 	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
327 	if (iter->next_startblock == rtg_blocks(rtg))
328 		goto done;
329 
330 	ASSERT(iter->next_startblock < rtg_blocks(rtg));
331 	ri_low.rm_startblock = iter->next_startblock;
332 	memset(&ri_high, 0xFF, sizeof(ri_high));
333 
334 	iter->rec_idx = 0;
335 	iter->rec_count = 0;
336 
337 	tp = xfs_trans_alloc_empty(mp);
338 	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
339 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
340 	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
341 			xfs_zone_gc_query_cb, iter);
342 	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
343 	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
344 	xfs_trans_cancel(tp);
345 
346 	if (error < 0)
347 		return error;
348 
349 	/*
350 	 * Sort the rmap records by inode number and increasing offset to
351 	 * defragment the mappings.
352 	 *
353 	 * This could be further enhanced by an even bigger look ahead window,
354 	 * but that's better left until we have better detection of changes to
355 	 * inode mapping to avoid the potential of GCing already dead data.
356 	 */
357 	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
358 			xfs_zone_gc_rmap_rec_cmp, NULL);
359 
360 	if (error == 0) {
361 		/*
362 		 * We finished iterating through the zone.
363 		 */
364 		iter->next_startblock = rtg_blocks(rtg);
365 		if (iter->rec_count == 0)
366 			goto done;
367 	}
368 
369 	return 0;
370 done:
371 	atomic_dec(&iter->victim_rtg->rtg_gccount);
372 	xfs_rtgroup_rele(iter->victim_rtg);
373 	iter->victim_rtg = NULL;
374 	return 0;
375 }
376 
377 static bool
378 xfs_zone_gc_iter_irec(
379 	struct xfs_mount	*mp,
380 	struct xfs_zone_gc_iter	*iter,
381 	struct xfs_rmap_irec	*chunk_rec,
382 	struct xfs_inode	**ipp)
383 {
384 	struct xfs_rmap_irec	*irec;
385 	int			error;
386 
387 retry:
388 	if (iter->rec_idx == iter->rec_count) {
389 		error = xfs_zone_gc_query(mp, iter);
390 		if (error)
391 			goto fail;
392 		if (!iter->victim_rtg)
393 			return false;
394 	}
395 
396 	irec = &iter->recs[iter->rec_idx];
397 	error = xfs_iget(mp, NULL, irec->rm_owner,
398 			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
399 	if (error) {
400 		/*
401 		 * If the inode was already deleted, skip over it.
402 		 */
403 		if (error == -ENOENT) {
404 			iter->rec_idx++;
405 			goto retry;
406 		}
407 		goto fail;
408 	}
409 
410 	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
411 		iter->rec_idx++;
412 		xfs_irele(*ipp);
413 		goto retry;
414 	}
415 
416 	*chunk_rec = *irec;
417 	return true;
418 
419 fail:
420 	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
421 	return false;
422 }
423 
424 static void
425 xfs_zone_gc_iter_advance(
426 	struct xfs_zone_gc_iter	*iter,
427 	xfs_extlen_t		count_fsb)
428 {
429 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
430 
431 	irec->rm_offset += count_fsb;
432 	irec->rm_startblock += count_fsb;
433 	irec->rm_blockcount -= count_fsb;
434 	if (!irec->rm_blockcount)
435 		iter->rec_idx++;
436 }
437 
438 static struct xfs_rtgroup *
439 xfs_zone_gc_pick_victim_from(
440 	struct xfs_mount	*mp,
441 	uint32_t		bucket)
442 {
443 	struct xfs_zone_info	*zi = mp->m_zone_info;
444 	uint32_t		victim_used = U32_MAX;
445 	struct xfs_rtgroup	*victim_rtg = NULL;
446 	uint32_t		bit;
447 
448 	if (!zi->zi_used_bucket_entries[bucket])
449 		return NULL;
450 
451 	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
452 			mp->m_sb.sb_rgcount) {
453 		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
454 
455 		if (!rtg)
456 			continue;
457 
458 		/*
459 		 * If the zone is already undergoing GC, don't pick it again.
460 		 *
461 		 * This prevents us from picking one of the zones for which we
462 		 * already submitted GC I/O, but for which the remapping hasn't
463 		 * concluded yet.  This won't cause data corruption, but
464 		 * increases write amplification and slows down GC, so this is
465 		 * a bad thing.
466 		 */
467 		if (atomic_read(&rtg->rtg_gccount)) {
468 			xfs_rtgroup_rele(rtg);
469 			continue;
470 		}
471 
472 		/* skip zones that are just waiting for a reset */
473 		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
474 		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
475 			xfs_rtgroup_rele(rtg);
476 			continue;
477 		}
478 
479 		if (victim_rtg)
480 			xfs_rtgroup_rele(victim_rtg);
481 		victim_rtg = rtg;
482 		victim_used = rtg_rmap(rtg)->i_used_blocks;
483 
484 		/*
485 		 * Any zone that is less than 1 percent used is fair game for
486 		 * instant reclaim. All of these zones are in the last
487 		 * bucket, so avoid the expensive division for the zones
488 		 * in the other buckets.
489 		 */
490 		if (bucket == 0 &&
491 		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
492 			break;
493 	}
494 
495 	return victim_rtg;
496 }
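
/*
 * To illustrate the instant reclaim cut-off above with made-up numbers:
 * for a zone of 65536 blocks, any candidate with fewer than 655 used
 * blocks is picked right away instead of scanning the rest of the bucket
 * for an even emptier zone, and the rtg_blocks(rtg) / 100 division is
 * only paid for candidates in that bucket.
 */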
497 
498 /*
499  * Iterate through all zones marked as reclaimable and find a candidate to
500  * reclaim.
501  */
502 static bool
503 xfs_zone_gc_select_victim(
504 	struct xfs_zone_gc_data	*data)
505 {
506 	struct xfs_zone_gc_iter	*iter = &data->iter;
507 	struct xfs_mount	*mp = data->mp;
508 	struct xfs_zone_info	*zi = mp->m_zone_info;
509 	struct xfs_rtgroup	*victim_rtg = NULL;
510 	unsigned int		bucket;
511 
512 	spin_lock(&zi->zi_used_buckets_lock);
513 	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
514 		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
515 		if (victim_rtg)
516 			break;
517 	}
518 	spin_unlock(&zi->zi_used_buckets_lock);
519 
520 	if (!victim_rtg)
521 		return false;
522 
523 	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
524 	xfs_zone_gc_iter_init(iter, victim_rtg);
525 	return true;
526 }
527 
528 static int
529 xfs_zone_gc_steal_open_zone(
530 	struct xfs_zone_gc_data	*data)
531 {
532 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
533 	struct xfs_open_zone	*oz, *found = NULL;
534 
535 	spin_lock(&zi->zi_open_zones_lock);
536 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
537 		if (!found || oz->oz_allocated < found->oz_allocated)
538 			found = oz;
539 	}
540 	if (!found) {
541 		spin_unlock(&zi->zi_open_zones_lock);
542 		return -EIO;
543 	}
544 
545 	trace_xfs_zone_gc_target_stolen(found->oz_rtg);
546 	found->oz_is_gc = true;
547 	zi->zi_nr_open_zones--;
548 	zi->zi_nr_open_gc_zones++;
549 	spin_unlock(&zi->zi_open_zones_lock);
550 
551 	atomic_inc(&found->oz_ref);
552 	data->oz = found;
553 	return 0;
554 }
555 
556 /*
557  * Ensure we have a valid open zone to write to.
558  */
559 static bool
560 xfs_zone_gc_select_target(
561 	struct xfs_zone_gc_data	*data)
562 {
563 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
564 
565 	if (data->oz) {
566 		/*
567 		 * If we have space available, just keep using the existing
568 		 * zone.
569 		 */
570 		if (data->oz->oz_allocated < rtg_blocks(data->oz->oz_rtg))
571 			return true;
572 
573 		/*
574 		 * Wait for all writes to the current zone to finish before
575 		 * picking a new one.
576 		 */
577 		if (data->oz->oz_written < rtg_blocks(data->oz->oz_rtg))
578 			return false;
579 
580 		xfs_open_zone_put(data->oz);
581 	}
582 
583 	/*
584 	 * Open a new zone when there is none currently in use.
585 	 */
586 	ASSERT(zi->zi_nr_open_zones <=
587 		data->mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
588 	data->oz = xfs_open_zone(data->mp, WRITE_LIFE_NOT_SET, true);
589 	if (!data->oz)
590 		return false;
591 	trace_xfs_zone_gc_target_opened(data->oz->oz_rtg);
592 	atomic_inc(&data->oz->oz_ref);
593 	spin_lock(&zi->zi_open_zones_lock);
594 	zi->zi_nr_open_gc_zones++;
595 	list_add_tail(&data->oz->oz_entry, &zi->zi_open_zones);
596 	spin_unlock(&zi->zi_open_zones_lock);
597 	return true;
598 }
599 
600 static void
601 xfs_zone_gc_end_io(
602 	struct bio		*bio)
603 {
604 	struct xfs_gc_bio	*chunk =
605 		container_of(bio, struct xfs_gc_bio, bio);
606 	struct xfs_zone_gc_data	*data = chunk->data;
607 
608 	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
609 	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
610 }
611 
612 static bool
613 xfs_zone_gc_alloc_blocks(
614 	struct xfs_zone_gc_data	*data,
615 	xfs_extlen_t		*count_fsb,
616 	xfs_daddr_t		*daddr,
617 	bool			*is_seq)
618 {
619 	struct xfs_mount	*mp = data->mp;
620 	struct xfs_open_zone	*oz = data->oz;
621 
622 	*count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));
623 
624 	/*
625 	 * Directly allocate GC blocks from the reserved pool.
626 	 *
627 	 * If we'd take them from the normal pool we could be stealing blocks
628 	 * from a regular writer, which would then have to wait for GC and
629 	 * deadlock.
630 	 */
631 	spin_lock(&mp->m_sb_lock);
632 	*count_fsb = min(*count_fsb,
633 			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
634 	*count_fsb = min3(*count_fsb,
635 			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
636 			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
637 	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
638 	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
639 	spin_unlock(&mp->m_sb_lock);
640 
641 	if (!*count_fsb)
642 		return false;
643 
644 	*daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0);
645 	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
646 	if (!*is_seq)
647 		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
648 	oz->oz_allocated += *count_fsb;
649 	atomic_inc(&oz->oz_ref);
650 	return true;
651 }
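
/*
 * Note on the daddr returned above: for conventional zones it is the
 * exact write position, i.e. the zone start plus the blocks already
 * allocated; for sequential write required zones it stays at the zone
 * start and the real location is only known once the zone append
 * completes (xfs_zone_gc_finish_chunk() picks it up from bi_sector).
 */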
652 
653 static void
654 xfs_zone_gc_add_data(
655 	struct xfs_gc_bio	*chunk)
656 {
657 	struct xfs_zone_gc_data	*data = chunk->data;
658 	unsigned int		len = chunk->len;
659 	unsigned int		off = data->scratch_head;
660 
661 	do {
662 		unsigned int	this_off = off % XFS_GC_BUF_SIZE;
663 		unsigned int	this_len = min(len, XFS_GC_BUF_SIZE - this_off);
664 
665 		bio_add_folio_nofail(&chunk->bio,
666 				data->scratch_folios[off / XFS_GC_BUF_SIZE],
667 				this_len, this_off);
668 		len -= this_len;
669 		off += this_len;
670 		if (off == data->scratch_size)
671 			off = 0;
672 	} while (len);
673 }
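
/*
 * Wrap-around illustration for the loop above (made-up numbers): with the
 * two 1 MiB scratch folios and scratch_head at 768k, a 1536k chunk maps to
 * 256k at offset 768k of folio 0, all of folio 1, and finally 256k at
 * offset 0 of folio 0 again.  That third segment is why the read bio in
 * xfs_zone_gc_start_chunk() is allocated with XFS_GC_NR_BUFS + 1 bio_vecs.
 */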
674 
675 static bool
676 xfs_zone_gc_can_start_chunk(
677 	struct xfs_zone_gc_data	*data)
678 {
679 
680 	if (xfs_is_shutdown(data->mp))
681 		return false;
682 	if (!data->scratch_available)
683 		return false;
684 
685 	if (!data->iter.victim_rtg) {
686 		if (kthread_should_stop() || kthread_should_park())
687 			return false;
688 		if (!xfs_zoned_need_gc(data->mp))
689 			return false;
690 		if (!xfs_zone_gc_select_victim(data))
691 			return false;
692 	}
693 
694 	return xfs_zone_gc_select_target(data);
695 }
696 
697 static bool
698 xfs_zone_gc_start_chunk(
699 	struct xfs_zone_gc_data	*data)
700 {
701 	struct xfs_zone_gc_iter	*iter = &data->iter;
702 	struct xfs_mount	*mp = data->mp;
703 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
704 	struct xfs_rmap_irec	irec;
705 	struct xfs_gc_bio	*chunk;
706 	struct xfs_inode	*ip;
707 	struct bio		*bio;
708 	xfs_daddr_t		daddr;
709 	bool			is_seq;
710 
711 	if (!xfs_zone_gc_can_start_chunk(data))
712 		return false;
713 
714 	set_current_state(TASK_RUNNING);
715 	if (!xfs_zone_gc_iter_irec(mp, iter, &irec, &ip))
716 		return false;
717 
718 	if (!xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
719 			&is_seq)) {
720 		xfs_irele(ip);
721 		return false;
722 	}
723 
724 	/*
725 	 * Scratch allocation can wrap around to the same buffer again,
726 	 * so provision an extra bvec for that case.
727 	 */
728 	bio = bio_alloc_bioset(bdev, XFS_GC_NR_BUFS + 1, REQ_OP_READ, GFP_NOFS,
729 			&data->bio_set);
730 	chunk = container_of(bio, struct xfs_gc_bio, bio);
731 	chunk->ip = ip;
732 	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
733 	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
734 	chunk->old_startblock =
735 		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
736 	chunk->new_daddr = daddr;
737 	chunk->is_seq = is_seq;
738 	chunk->data = data;
739 	chunk->oz = data->oz;
740 	chunk->victim_rtg = iter->victim_rtg;
741 	atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
742 	atomic_inc(&chunk->victim_rtg->rtg_gccount);
743 
744 	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
745 	bio->bi_end_io = xfs_zone_gc_end_io;
746 	xfs_zone_gc_add_data(chunk);
747 	data->scratch_head =
748 		(data->scratch_head + chunk->len) % data->scratch_size;
749 	data->scratch_available -= chunk->len;
750 
751 	XFS_STATS_INC(mp, xs_gc_read_calls);
752 
753 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
754 	list_add_tail(&chunk->entry, &data->reading);
755 	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
756 
757 	submit_bio(bio);
758 	return true;
759 }
760 
761 static void
762 xfs_zone_gc_free_chunk(
763 	struct xfs_gc_bio	*chunk)
764 {
765 	atomic_dec(&chunk->victim_rtg->rtg_gccount);
766 	xfs_rtgroup_rele(chunk->victim_rtg);
767 	list_del(&chunk->entry);
768 	xfs_open_zone_put(chunk->oz);
769 	xfs_irele(chunk->ip);
770 	bio_put(&chunk->bio);
771 }
772 
773 static void
774 xfs_zone_gc_submit_write(
775 	struct xfs_zone_gc_data	*data,
776 	struct xfs_gc_bio	*chunk)
777 {
778 	if (chunk->is_seq) {
779 		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
780 		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
781 	}
782 	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
783 	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
784 	submit_bio(&chunk->bio);
785 }
786 
787 static struct xfs_gc_bio *
788 xfs_zone_gc_split_write(
789 	struct xfs_zone_gc_data	*data,
790 	struct xfs_gc_bio	*chunk)
791 {
792 	struct queue_limits	*lim =
793 		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
794 	struct xfs_gc_bio	*split_chunk;
795 	int			split_sectors;
796 	unsigned int		split_len;
797 	struct bio		*split;
798 	unsigned int		nsegs;
799 
800 	if (!chunk->is_seq)
801 		return NULL;
802 
803 	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
804 			lim->max_zone_append_sectors << SECTOR_SHIFT);
805 	if (!split_sectors)
806 		return NULL;
807 
808 	/* ensure the split chunk is still block size aligned */
809 	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
810 			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
811 	split_len = split_sectors << SECTOR_SHIFT;
812 
813 	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
814 	split_chunk = container_of(split, struct xfs_gc_bio, bio);
815 	split_chunk->data = data;
816 	ihold(VFS_I(chunk->ip));
817 	split_chunk->ip = chunk->ip;
818 	split_chunk->is_seq = chunk->is_seq;
819 	split_chunk->offset = chunk->offset;
820 	split_chunk->len = split_len;
821 	split_chunk->old_startblock = chunk->old_startblock;
822 	split_chunk->new_daddr = chunk->new_daddr;
823 	split_chunk->oz = chunk->oz;
824 	atomic_inc(&chunk->oz->oz_ref);
825 
826 	split_chunk->victim_rtg = chunk->victim_rtg;
827 	atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
828 	atomic_inc(&chunk->victim_rtg->rtg_gccount);
829 
830 	chunk->offset += split_len;
831 	chunk->len -= split_len;
832 	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
833 
834 	/* add right before the original chunk */
835 	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
836 	list_add_tail(&split_chunk->entry, &chunk->entry);
837 	return split_chunk;
838 }
839 
840 static void
841 xfs_zone_gc_write_chunk(
842 	struct xfs_gc_bio	*chunk)
843 {
844 	struct xfs_zone_gc_data	*data = chunk->data;
845 	struct xfs_mount	*mp = chunk->ip->i_mount;
846 	struct xfs_gc_bio	*split_chunk;
847 
848 	if (chunk->bio.bi_status)
849 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
850 	if (xfs_is_shutdown(mp)) {
851 		xfs_zone_gc_free_chunk(chunk);
852 		return;
853 	}
854 
855 	XFS_STATS_INC(mp, xs_gc_write_calls);
856 	XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);
857 
858 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
859 	list_move_tail(&chunk->entry, &data->writing);
860 
861 	bio_reuse(&chunk->bio, REQ_OP_WRITE);
862 	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
863 		xfs_zone_gc_submit_write(data, split_chunk);
864 	xfs_zone_gc_submit_write(data, chunk);
865 }
866 
867 static void
868 xfs_zone_gc_finish_chunk(
869 	struct xfs_gc_bio	*chunk)
870 {
871 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
872 	struct xfs_zone_gc_data	*data = chunk->data;
873 	struct xfs_inode	*ip = chunk->ip;
874 	struct xfs_mount	*mp = ip->i_mount;
875 	int			error;
876 
877 	if (chunk->bio.bi_status)
878 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
879 	if (xfs_is_shutdown(mp)) {
880 		xfs_zone_gc_free_chunk(chunk);
881 		return;
882 	}
883 
884 	data->scratch_tail =
885 		(data->scratch_tail + chunk->len) % data->scratch_size;
886 	data->scratch_available += chunk->len;
887 
888 	/*
889 	 * Cycle through the iolock and wait for direct I/O and layouts to
890 	 * ensure no one is reading from the old mapping before it goes away.
891 	 *
892 	 * Note that xfs_zoned_end_io() below checks that no other writer raced
893 	 * with us to update the mapping by checking that the old startblock
894 	 * didn't change.
895 	 */
896 	xfs_ilock(ip, iolock);
897 	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
898 	if (!error)
899 		inode_dio_wait(VFS_I(ip));
900 	xfs_iunlock(ip, iolock);
901 	if (error)
902 		goto free;
903 
904 	if (chunk->is_seq)
905 		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
906 	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
907 			chunk->new_daddr, chunk->oz, chunk->old_startblock);
908 free:
909 	if (error)
910 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
911 	xfs_zone_gc_free_chunk(chunk);
912 }
913 
914 static void
915 xfs_zone_gc_finish_reset(
916 	struct xfs_gc_bio	*chunk)
917 {
918 	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
919 	struct xfs_mount	*mp = rtg_mount(rtg);
920 	struct xfs_zone_info	*zi = mp->m_zone_info;
921 
922 	if (chunk->bio.bi_status) {
923 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
924 		goto out;
925 	}
926 
927 	xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE);
928 	atomic_inc(&zi->zi_nr_free_zones);
929 
930 	xfs_zoned_add_available(mp, rtg_blocks(rtg));
931 
932 	wake_up_all(&zi->zi_zone_wait);
933 out:
934 	list_del(&chunk->entry);
935 	bio_put(&chunk->bio);
936 }
937 
938 static void
939 xfs_submit_zone_reset_bio(
940 	struct bio		*bio,
941 	void			*priv)
942 {
943 	struct xfs_rtgroup	*rtg = priv;
944 	struct xfs_mount	*mp = rtg_mount(rtg);
945 
946 	trace_xfs_zone_reset(rtg);
947 
948 	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
949 
950 	if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
951 		bio_io_error(bio);
952 		return;
953 	}
954 
955 	XFS_STATS_INC(mp, xs_gc_zone_reset_calls);
956 
957 	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0);
958 	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
959 		/*
960 		 * Also use the bio to drive the state machine when neither
961 		 * zone reset nor discard is supported to keep things simple.
962 		 */
963 		if (!bdev_max_discard_sectors(bio->bi_bdev)) {
964 			bio_endio(bio);
965 			return;
966 		}
967 		bio->bi_opf &= ~REQ_OP_ZONE_RESET;
968 		bio->bi_opf |= REQ_OP_DISCARD;
969 		bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
970 	}
971 
972 	submit_bio(bio);
973 }
974 
975 int
976 xfs_zone_gc_reset_sync(
977 	struct xfs_rtgroup	*rtg)
978 {
979 	struct bio		bio;
980 	int			error;
981 
982 	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
983 			REQ_OP_ZONE_RESET | REQ_SYNC);
984 	bio_await(&bio, rtg, xfs_submit_zone_reset_bio);
985 	error = blk_status_to_errno(bio.bi_status);
986 	bio_uninit(&bio);
987 	return error;
988 }
989 
990 static void
991 xfs_zone_gc_reset_zones(
992 	struct xfs_zone_gc_data	*data,
993 	struct xfs_group	*reset_list)
994 {
995 	struct xfs_group	*next = reset_list;
996 
997 	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
998 		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
999 		return;
1000 	}
1001 
1002 	do {
1003 		struct xfs_rtgroup	*rtg = to_rtg(next);
1004 		struct xfs_gc_bio	*chunk;
1005 		struct bio		*bio;
1006 
1007 		xfs_log_force_inode(rtg_rmap(rtg));
1008 
1009 		next = rtg_group(rtg)->xg_next_reset;
1010 		rtg_group(rtg)->xg_next_reset = NULL;
1011 
1012 		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
1013 				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
1014 		bio->bi_private = rtg;
1015 		bio->bi_end_io = xfs_zone_gc_end_io;
1016 
1017 		chunk = container_of(bio, struct xfs_gc_bio, bio);
1018 		chunk->data = data;
1019 		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
1020 		list_add_tail(&chunk->entry, &data->resetting);
1021 		xfs_submit_zone_reset_bio(bio, rtg);
1022 	} while (next);
1023 }
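
/*
 * Ordering recap for the resets issued above: the RT device cache is
 * flushed once up front, and xfs_log_force_inode() on each zone's rmap
 * inode ensures the transactions referencing that rmap are committed to
 * disk before the reset or discard is submitted, matching the guarantees
 * described in the file-level comment.
 */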
1024 
1025 /*
1026  * Handle the work to read and write data for GC and to reset the zones,
1027  * including handling all completions.
1028  *
1029  * Note that the order of the chunks is preserved so that we don't undo the
1030  * optimal order established by xfs_zone_gc_query().
1031  */
1032 static void
1033 xfs_zone_gc_handle_work(
1034 	struct xfs_zone_gc_data	*data)
1035 {
1036 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
1037 	struct xfs_gc_bio	*chunk, *next;
1038 	struct xfs_group	*reset_list;
1039 	struct blk_plug		plug;
1040 
1041 	spin_lock(&zi->zi_reset_list_lock);
1042 	reset_list = zi->zi_reset_list;
1043 	zi->zi_reset_list = NULL;
1044 	spin_unlock(&zi->zi_reset_list_lock);
1045 
1046 	if (reset_list) {
1047 		set_current_state(TASK_RUNNING);
1048 		xfs_zone_gc_reset_zones(data, reset_list);
1049 	}
1050 
1051 	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1052 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1053 			break;
1054 		set_current_state(TASK_RUNNING);
1055 		xfs_zone_gc_finish_reset(chunk);
1056 	}
1057 
1058 	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1059 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1060 			break;
1061 		set_current_state(TASK_RUNNING);
1062 		xfs_zone_gc_finish_chunk(chunk);
1063 	}
1064 
1065 	blk_start_plug(&plug);
1066 	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1067 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1068 			break;
1069 		set_current_state(TASK_RUNNING);
1070 		xfs_zone_gc_write_chunk(chunk);
1071 	}
1072 	blk_finish_plug(&plug);
1073 
1074 	blk_start_plug(&plug);
1075 	while (xfs_zone_gc_start_chunk(data))
1076 		;
1077 	blk_finish_plug(&plug);
1078 }
1079 
1080 /*
1081  * Note that the current GC algorithm would break reflinks and thus duplicate
1082  * data that was shared by multiple owners before.  Because of that reflinks
1083  * are currently not supported on zoned file systems and can't be created or
1084  * mounted.
1085  */
1086 static int
1087 xfs_zoned_gcd(
1088 	void			*private)
1089 {
1090 	struct xfs_zone_gc_data	*data = private;
1091 	struct xfs_mount	*mp = data->mp;
1092 	struct xfs_zone_info	*zi = mp->m_zone_info;
1093 	unsigned int		nofs_flag;
1094 
1095 	nofs_flag = memalloc_nofs_save();
1096 	set_freezable();
1097 
1098 	for (;;) {
1099 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1100 		xfs_set_zonegc_running(mp);
1101 
1102 		xfs_zone_gc_handle_work(data);
1103 
1104 		/*
1105 		 * Only sleep if nothing set the state to running.  Else check for
1106 		 * work again as someone might have queued up more work and woken
1107 		 * us in the meantime.
1108 		 */
1109 		if (get_current_state() == TASK_RUNNING) {
1110 			try_to_freeze();
1111 			continue;
1112 		}
1113 
1114 		if (list_empty(&data->reading) &&
1115 		    list_empty(&data->writing) &&
1116 		    list_empty(&data->resetting) &&
1117 		    !zi->zi_reset_list) {
1118 			xfs_clear_zonegc_running(mp);
1119 			xfs_zoned_resv_wake_all(mp);
1120 
1121 			if (kthread_should_stop()) {
1122 				__set_current_state(TASK_RUNNING);
1123 				break;
1124 			}
1125 
1126 			if (kthread_should_park()) {
1127 				__set_current_state(TASK_RUNNING);
1128 				kthread_parkme();
1129 				continue;
1130 			}
1131 		}
1132 
1133 		schedule();
1134 	}
1135 	xfs_clear_zonegc_running(mp);
1136 
1137 	if (data->oz)
1138 		xfs_open_zone_put(data->oz);
1139 	if (data->iter.victim_rtg)
1140 		xfs_rtgroup_rele(data->iter.victim_rtg);
1141 
1142 	memalloc_nofs_restore(nofs_flag);
1143 	xfs_zone_gc_data_free(data);
1144 	return 0;
1145 }
1146 
1147 void
1148 xfs_zone_gc_start(
1149 	struct xfs_mount	*mp)
1150 {
1151 	if (xfs_has_zoned(mp))
1152 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1153 }
1154 
1155 void
1156 xfs_zone_gc_stop(
1157 	struct xfs_mount	*mp)
1158 {
1159 	if (xfs_has_zoned(mp))
1160 		kthread_park(mp->m_zone_info->zi_gc_thread);
1161 }
1162 
1163 void
1164 xfs_zone_gc_wakeup(
1165 	struct xfs_mount	*mp)
1166 {
1167 	struct super_block      *sb = mp->m_super;
1168 
1169 	/*
1170 	 * If we are unmounting the file system we must not try to
1171 	 * wake gc as m_zone_info might have been freed already.
1172 	 */
1173 	if (down_read_trylock(&sb->s_umount)) {
1174 		if (!xfs_is_readonly(mp))
1175 			wake_up_process(mp->m_zone_info->zi_gc_thread);
1176 		up_read(&sb->s_umount);
1177 	}
1178 }
1179 
1180 int
1181 xfs_zone_gc_mount(
1182 	struct xfs_mount	*mp)
1183 {
1184 	struct xfs_zone_info	*zi = mp->m_zone_info;
1185 	struct xfs_zone_gc_data	*data;
1186 	int			error;
1187 
1188 	data = xfs_zone_gc_data_alloc(mp);
1189 	if (!data)
1190 		return -ENOMEM;
1191 
1192 	/*
1193 	 * If there are no free zones available for GC, or the number of open
1194 	 * zones has reached the open zone limit, pick the open zone with
1195 	 * the least used space to GC into.  This should only happen after an
1196 	 * unclean shutdown while GC was ongoing.  Otherwise a GC zone will
1197 	 * be selected from the free zone pool on demand.
1198 	 */
1199 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1200 	    zi->zi_nr_open_zones >= mp->m_max_open_zones) {
1201 		error = xfs_zone_gc_steal_open_zone(data);
1202 		if (error) {
1203 			xfs_warn(mp, "unable to steal an open zone for gc");
1204 			goto out_free_gc_data;
1205 		}
1206 	}
1207 
1208 	zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1209 			"xfs-zone-gc/%s", mp->m_super->s_id);
1210 	if (IS_ERR(zi->zi_gc_thread)) {
1211 		xfs_warn(mp, "unable to create zone gc thread");
1212 		error = PTR_ERR(zi->zi_gc_thread);
1213 		goto out_put_oz;
1214 	}
1215 
1216 	/* xfs_zone_gc_start will unpark for rw mounts */
1217 	kthread_park(zi->zi_gc_thread);
1218 	return 0;
1219 
1220 out_put_oz:
1221 	if (data->oz)
1222 		xfs_open_zone_put(data->oz);
1223 out_free_gc_data:
1224 	kfree(data);
1225 	return error;
1226 }
1227 
1228 void
1229 xfs_zone_gc_unmount(
1230 	struct xfs_mount	*mp)
1231 {
1232 	struct xfs_zone_info	*zi = mp->m_zone_info;
1233 
1234 	kthread_stop(zi->zi_gc_thread);
1235 }
1236