xref: /linux/fs/xfs/xfs_zone_gc.c (revision 399af66228cfd7df79dc360810b6b673000f8090)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023-2025 Christoph Hellwig.
4  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5  */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_errortag.h"
20 #include "xfs_error.h"
21 #include "xfs_zone_alloc.h"
22 #include "xfs_zone_priv.h"
23 #include "xfs_zones.h"
24 #include "xfs_trace.h"
25 
26 /*
27  * Implement Garbage Collection (GC) of partially used zoned.
28  *
29  * To support the purely sequential writes in each zone, zoned XFS needs to be
30  * able to move data remaining in a zone out of it to reset the zone to prepare
31  * for writing to it again.
32  *
33  * This is done by the GC thread implemented in this file.  To support that a
34  * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
35  * write the garbage collected data into.
36  *
37  * Whenever the available space is below the chosen threshold, the GC thread
38  * looks for potential non-empty but not fully used zones that are worth
39  * reclaiming.  Once found the rmap for the victim zone is queried, and after
40  * a bit of sorting to reduce fragmentation, the still live extents are read
41  * into memory and written to the GC target zone, and the bmap btree of the
42  * files is updated to point to the new location.  To avoid taking the IOLOCK
43  * and MMAPLOCK for the entire GC process and thus affecting the latency of
44  * user reads and writes to the files, the GC writes are speculative and the
45  * I/O completion checks that no other writes happened for the affected regions
46  * before remapping.
47  *
48  * Once a zone does not contain any valid data, be that through GC or user
49  * block removal, it is queued for a zone reset.  The reset operation
50  * carefully ensures that the RT device cache is flushed and all transactions
51  * referencing the rmap have been committed to disk.
52  */
53 
54 /*
55  * Size of each GC scratch allocation, and the number of buffers.
56  */
57 #define XFS_GC_BUF_SIZE		SZ_1M
58 #define XFS_GC_NR_BUFS		2
59 static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
60 
61 /*
62  * Chunk that is read and written for each GC operation.
63  *
64  * Note that for writes to actual zoned devices, the chunk can be split when
65  * reaching the hardware limit.
66  */
67 struct xfs_gc_bio {
68 	struct xfs_zone_gc_data		*data;
69 
70 	/*
71 	 * Entry into the reading/writing/resetting list.  Only accessed from
72 	 * the GC thread, so no locking needed.
73 	 */
74 	struct list_head		entry;
75 
76 	/*
77 	 * State of this gc_bio.  Done means the current I/O completed.
78 	 * Set from the bio end I/O handler, read from the GC thread.
79 	 */
80 	enum {
81 		XFS_GC_BIO_NEW,
82 		XFS_GC_BIO_DONE,
83 	} state;
84 
85 	/*
86 	 * Pointer to the inode and byte range in the inode that this
87 	 * GC chunk is operating on.
88 	 */
89 	struct xfs_inode		*ip;
90 	loff_t				offset;
91 	unsigned int			len;
92 
93 	/*
94 	 * Existing startblock (in the zone to be freed) and newly assigned
95 	 * daddr in the zone GCed into.
96 	 */
97 	xfs_fsblock_t			old_startblock;
98 	xfs_daddr_t			new_daddr;
99 
100 	/* Are we writing to a sequential write required zone? */
101 	bool				is_seq;
102 
103 	/* Open Zone being written to */
104 	struct xfs_open_zone		*oz;
105 
106 	struct xfs_rtgroup		*victim_rtg;
107 
108 	/* Bio used for reads and writes, including the bvec used by it */
109 	struct bio			bio;	/* must be last */
110 };
111 
112 #define XFS_ZONE_GC_RECS		1024
113 
114 /* iterator, needs to be reinitialized for each victim zone */
115 struct xfs_zone_gc_iter {
116 	struct xfs_rtgroup		*victim_rtg;
117 	unsigned int			rec_count;
118 	unsigned int			rec_idx;
119 	xfs_agblock_t			next_startblock;
120 	struct xfs_rmap_irec		*recs;
121 };
122 
123 /*
124  * Per-mount GC state.
125  */
126 struct xfs_zone_gc_data {
127 	struct xfs_mount		*mp;
128 
129 	/* bioset used to allocate the gc_bios */
130 	struct bio_set			bio_set;
131 
132 	/*
133 	 * Scratchpad to buffer GC data, organized as a ring buffer over
134 	 * discontiguous folios.  scratch_head is where the buffer is filled,
135 	 * scratch_tail tracks the buffer space freed, and scratch_available
136 	 * counts the space available in the ring buffer between the head and
137 	 * the tail.
138 	 */
139 	struct folio			*scratch_folios[XFS_GC_NR_BUFS];
140 	unsigned int			scratch_size;
141 	unsigned int			scratch_available;
142 	unsigned int			scratch_head;
143 	unsigned int			scratch_tail;
144 
145 	/*
146 	 * List of bios currently being read, written and reset.
147 	 * These lists are only accessed by the GC thread itself, and must only
148 	 * be processed in order.
149 	 */
150 	struct list_head		reading;
151 	struct list_head		writing;
152 	struct list_head		resetting;
153 
154 	/*
155 	 * Iterator for the victim zone.
156 	 */
157 	struct xfs_zone_gc_iter		iter;
158 };
159 
160 /*
161  * We aim to keep enough zones free in stock to fully use the open zone limit
162  * for data placement purposes. Additionally, the m_zonegc_low_space tunable
163  * can be set to make sure a fraction of the unused blocks are available for
164  * writing.
165  */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	/* Nothing reclaimable, so running GC cannot produce free zones. */
	if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	/*
	 * Start GC when there is not enough space left to fill all user
	 * visible open zones (the GC zones are reserved on top of them).
	 */
	if (available <
	    xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

	/*
	 * Compute free * m_zonegc_low_space / 100 while avoiding a 64-bit
	 * overflow of the intermediate product: split free into quotient and
	 * remainder of the division by 100 and scale both parts separately.
	 */
	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		    remainder * div_s64(mp->m_zonegc_low_space, 100);

	/* Also GC when less than the tuned fraction of free space is usable. */
	if (available < threshold)
		return true;

	return false;
}
193 
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc_obj(*data);
	if (!data)
		return NULL;
	/* Record buffer for the rmap query of a victim zone. */
	data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * The bioset hands out bios embedded at the end of struct xfs_gc_bio
	 * (see the offsetof), so each allocation gives us a whole GC chunk.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	/* Scratch folios backing the GC data ring buffer. */
	for (i = 0; i < XFS_GC_NR_BUFS; i++) {
		data->scratch_folios[i] =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
		if (!data->scratch_folios[i])
			goto out_free_scratch;
	}
	data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
	data->scratch_available = data->scratch_size;
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	/* Only free the folios that were successfully allocated. */
	while (--i >= 0)
		folio_put(data->scratch_folios[i]);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}
235 
236 static void
xfs_zone_gc_data_free(struct xfs_zone_gc_data * data)237 xfs_zone_gc_data_free(
238 	struct xfs_zone_gc_data	*data)
239 {
240 	int			i;
241 
242 	for (i = 0; i < XFS_GC_NR_BUFS; i++)
243 		folio_put(data->scratch_folios[i]);
244 	bioset_exit(&data->bio_set);
245 	kfree(data->iter.recs);
246 	kfree(data);
247 }
248 
249 static void
xfs_zone_gc_iter_init(struct xfs_zone_gc_iter * iter,struct xfs_rtgroup * victim_rtg)250 xfs_zone_gc_iter_init(
251 	struct xfs_zone_gc_iter	*iter,
252 	struct xfs_rtgroup	*victim_rtg)
253 
254 {
255 	iter->next_startblock = 0;
256 	iter->rec_count = 0;
257 	iter->rec_idx = 0;
258 	iter->victim_rtg = victim_rtg;
259 	atomic_inc(&victim_rtg->rtg_gccount);
260 }
261 
262 /*
263  * Query the rmap of the victim zone to gather the records to evacuate.
264  */
265 static int
xfs_zone_gc_query_cb(struct xfs_btree_cur * cur,const struct xfs_rmap_irec * irec,void * private)266 xfs_zone_gc_query_cb(
267 	struct xfs_btree_cur	*cur,
268 	const struct xfs_rmap_irec *irec,
269 	void			*private)
270 {
271 	struct xfs_zone_gc_iter	*iter = private;
272 
273 	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
274 	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
275 	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
276 
277 	iter->recs[iter->rec_count] = *irec;
278 	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
279 		iter->next_startblock =
280 			irec->rm_startblock + irec->rm_blockcount;
281 		return 1;
282 	}
283 	return 0;
284 }
285 
286 static int
xfs_zone_gc_rmap_rec_cmp(const void * a,const void * b)287 xfs_zone_gc_rmap_rec_cmp(
288 	const void			*a,
289 	const void			*b)
290 {
291 	const struct xfs_rmap_irec	*reca = a;
292 	const struct xfs_rmap_irec	*recb = b;
293 	int				diff;
294 
295 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
296 	if (diff)
297 		return diff;
298 	return cmp_int(reca->rm_offset, recb->rm_offset);
299 }
300 
/*
 * Gather the next batch of rmap records from the victim zone into the
 * iterator.  Releases the victim zone once the whole zone was walked and
 * no records remain.
 */
static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	/* Query from the resume point to the end of the group (all-ones key). */
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	/* An empty transaction is enough for a read-only rmap btree walk. */
	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	/* A positive "error" only means the callback stopped the walk early. */
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	/* Zone fully processed: drop the GC marker and our group reference. */
	atomic_dec(&iter->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}
363 
/*
 * Return the next rmap record to garbage collect in *chunk_rec and a
 * referenced inode for its owner in *ipp.  Returns false when the victim
 * zone is exhausted or the file system was shut down due to an error.
 */
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	/* Current batch consumed, refill from the rmap btree. */
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		/* xfs_zone_gc_query() drops the victim when it is finished. */
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	/*
	 * The rmap owner is not validated against the inode btree, so look
	 * it up untrusted, and don't pollute the inode cache with it.
	 */
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	/* Only regular realtime files carry zoned data; skip anything else. */
	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}
413 
414 static void
xfs_zone_gc_iter_advance(struct xfs_zone_gc_iter * iter,xfs_extlen_t count_fsb)415 xfs_zone_gc_iter_advance(
416 	struct xfs_zone_gc_iter	*iter,
417 	xfs_extlen_t		count_fsb)
418 {
419 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
420 
421 	irec->rm_offset += count_fsb;
422 	irec->rm_startblock += count_fsb;
423 	irec->rm_blockcount -= count_fsb;
424 	if (!irec->rm_blockcount)
425 		iter->rec_idx++;
426 }
427 
/*
 * Pick the least-used, non-empty zone in @bucket that is not already
 * undergoing GC.  Returns the zone with a grabbed reference, or NULL if
 * the bucket holds no suitable candidate.  Called with the used-bucket
 * lock held by xfs_zone_gc_select_victim().
 */
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	/* Fast path out for an empty bucket. */
	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/*
		 * If the zone is already undergoing GC, don't pick it again.
		 *
		 * This prevents us from picking one of the zones for which we
		 * already submitted GC I/O, but for which the remapping hasn't
		 * concluded yet.  This won't cause data corruption, but
		 * increases write amplification and slows down GC, so this is
		 * a bad thing.
		 */
		if (atomic_read(&rtg->rtg_gccount)) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* New best candidate: swap the reference we are holding. */
		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}
487 
488 /*
489  * Iterate through all zones marked as reclaimable and find a candidate to
490  * reclaim.
491  */
492 static bool
xfs_zone_gc_select_victim(struct xfs_zone_gc_data * data)493 xfs_zone_gc_select_victim(
494 	struct xfs_zone_gc_data	*data)
495 {
496 	struct xfs_zone_gc_iter	*iter = &data->iter;
497 	struct xfs_mount	*mp = data->mp;
498 	struct xfs_zone_info	*zi = mp->m_zone_info;
499 	struct xfs_rtgroup	*victim_rtg = NULL;
500 	unsigned int		bucket;
501 
502 	spin_lock(&zi->zi_used_buckets_lock);
503 	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
504 		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
505 		if (victim_rtg)
506 			break;
507 	}
508 	spin_unlock(&zi->zi_used_buckets_lock);
509 
510 	if (!victim_rtg)
511 		return false;
512 
513 	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
514 	xfs_zone_gc_iter_init(iter, victim_rtg);
515 	return true;
516 }
517 
518 static struct xfs_open_zone *
xfs_zone_gc_steal_open(struct xfs_zone_info * zi)519 xfs_zone_gc_steal_open(
520 	struct xfs_zone_info	*zi)
521 {
522 	struct xfs_open_zone	*oz, *found = NULL;
523 
524 	spin_lock(&zi->zi_open_zones_lock);
525 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
526 		if (!found || oz->oz_allocated < found->oz_allocated)
527 			found = oz;
528 	}
529 
530 	if (found) {
531 		found->oz_is_gc = true;
532 		list_del_init(&found->oz_entry);
533 		zi->zi_nr_open_zones--;
534 	}
535 
536 	spin_unlock(&zi->zi_open_zones_lock);
537 	return found;
538 }
539 
/*
 * Open a new GC target zone and publish it as zi_open_gc_zone.  Returns
 * NULL while writes to the previous target are still in flight.
 */
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	/* The GC zones are reserved on top of the user-visible open zones. */
	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	/* Publish the new target (possibly NULL) under the open zones lock. */
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}
563 
564 /*
565  * Ensure we have a valid open zone to write the GC data to.
566  *
567  * If the current target zone has space keep writing to it, else first wait for
568  * all pending writes and then pick a new one.
569  */
570 static struct xfs_open_zone *
xfs_zone_gc_ensure_target(struct xfs_mount * mp)571 xfs_zone_gc_ensure_target(
572 	struct xfs_mount	*mp)
573 {
574 	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;
575 
576 	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
577 		return xfs_zone_gc_select_target(mp);
578 	return oz;
579 }
580 
581 static void
xfs_zone_gc_end_io(struct bio * bio)582 xfs_zone_gc_end_io(
583 	struct bio		*bio)
584 {
585 	struct xfs_gc_bio	*chunk =
586 		container_of(bio, struct xfs_gc_bio, bio);
587 	struct xfs_zone_gc_data	*data = chunk->data;
588 
589 	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
590 	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
591 }
592 
/*
 * Allocate up to *count_fsb blocks in the GC target zone.
 *
 * On success returns the open zone with an extra reference, trims
 * *count_fsb to what could actually be allocated, and sets *daddr and
 * *is_seq for the write.  Returns NULL if no target zone or no blocks
 * are available.
 */
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	/* Never allocate more than fits into the scratch ring buffer. */
	*count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	/* Nothing allocated (and nothing deducted above either). */
	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	/*
	 * Conventional zones get an explicit write position; sequential
	 * write required zones are written with zone append starting at the
	 * zone base (see xfs_zone_gc_submit_write).
	 */
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}
637 
638 static void
xfs_zone_gc_add_data(struct xfs_gc_bio * chunk)639 xfs_zone_gc_add_data(
640 	struct xfs_gc_bio	*chunk)
641 {
642 	struct xfs_zone_gc_data	*data = chunk->data;
643 	unsigned int		len = chunk->len;
644 	unsigned int		off = data->scratch_head;
645 
646 	do {
647 		unsigned int	this_off = off % XFS_GC_BUF_SIZE;
648 		unsigned int	this_len = min(len, XFS_GC_BUF_SIZE - this_off);
649 
650 		bio_add_folio_nofail(&chunk->bio,
651 				data->scratch_folios[off / XFS_GC_BUF_SIZE],
652 				this_len, this_off);
653 		len -= this_len;
654 		off += this_len;
655 		if (off == data->scratch_size)
656 			off = 0;
657 	} while (len);
658 }
659 
/*
 * Start reading the next chunk of live data from the victim zone into the
 * scratch buffer and queue it on the reading list.  Returns false when
 * there is no more work or no space/blocks to make progress with.
 */
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	unsigned int		len;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	/* May shorten irec.rm_blockcount to what could be allocated. */
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	/* The chunk can span at most XFS_GC_NR_BUFS scratch folios. */
	bio = bio_alloc_bioset(bdev,
			min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
			REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = len;
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->data = data;
	chunk->oz = oz;
	/* Hold the victim and its GC marker for the lifetime of the chunk. */
	chunk->victim_rtg = iter->victim_rtg;
	atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	/* Read from the old location in the victim zone. */
	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	xfs_zone_gc_add_data(chunk);
	/* Claim the scratch space; freed again in xfs_zone_gc_finish_chunk. */
	data->scratch_head = (data->scratch_head + len) % data->scratch_size;
	data->scratch_available -= len;

	XFS_STATS_INC(mp, xs_gc_read_calls);

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}
722 
/*
 * Release all references held by a GC chunk and free it.  Undoes the
 * rtg_gccount / group / open zone / inode references taken when the chunk
 * was set up in xfs_zone_gc_start_chunk (or cloned in split_write).
 */
static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	atomic_dec(&chunk->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(chunk->victim_rtg);
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	/* Frees the whole struct xfs_gc_bio, which embeds the bio. */
	bio_put(&chunk->bio);
}
734 
735 static void
xfs_zone_gc_submit_write(struct xfs_zone_gc_data * data,struct xfs_gc_bio * chunk)736 xfs_zone_gc_submit_write(
737 	struct xfs_zone_gc_data	*data,
738 	struct xfs_gc_bio	*chunk)
739 {
740 	if (chunk->is_seq) {
741 		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
742 		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
743 	}
744 	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
745 	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
746 	submit_bio(&chunk->bio);
747 }
748 
/*
 * Split a zone append write that exceeds the device's zone append limit.
 *
 * Returns a new chunk for the front part of the I/O (with its own set of
 * references) and shrinks @chunk to the remainder, or NULL when no split
 * is needed.  Only sequential write required zones ever need splitting.
 */
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	/* The split chunk carries its own inode reference. */
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	/* ... and its own open zone reference. */
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	/* ... and its own victim group reference and GC marker. */
	split_chunk->victim_rtg = chunk->victim_rtg;
	atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	/* Advance the original chunk past the part that was split off. */
	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}
801 
/*
 * The read into the scratch buffer finished; turn the chunk around and
 * write the data out to the GC target zone.
 */
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_write_calls);
	XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);

	/* Move to the writing list before reusing the bio for the write. */
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reuse(&chunk->bio, REQ_OP_WRITE);
	/* Split off and submit any pieces exceeding the zone append limit. */
	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}
828 
/*
 * The GC write completed; remap the file's bmap btree to the new location
 * and release the chunk.  Also frees the chunk's scratch buffer space.
 */
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	/* Return the chunk's space to the scratch ring buffer. */
	data->scratch_tail =
		(data->scratch_tail + chunk->len) % data->scratch_size;
	data->scratch_available += chunk->len;

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	/* Zone append: the device reported the actual write location. */
	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}
875 
/*
 * A zone reset completed: mark the zone free, publish the newly available
 * blocks and wake anyone waiting for a free zone.
 */
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	/* The whole zone's worth of blocks is writable again. */
	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}
899 
/*
 * Submit @bio as a zone reset for @rtg, falling back to a discard (or a
 * plain completion) for conventional zones that cannot be reset.
 */
static void
xfs_submit_zone_reset_bio(
	struct xfs_rtgroup	*rtg,
	struct bio		*bio)
{
	struct xfs_mount	*mp = rtg_mount(rtg);

	trace_xfs_zone_reset(rtg);

	/* Never reset a zone that still contains live data. */
	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);

	/* Error injection point for testing failed resets. */
	if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
		bio_io_error(bio);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_zone_reset_calls);

	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (!bdev_max_discard_sectors(bio->bi_bdev)) {
			bio_endio(bio);
			return;
		}
		/* Conventional zone: discard the whole zone instead. */
		bio->bi_opf &= ~REQ_OP_ZONE_RESET;
		bio->bi_opf |= REQ_OP_DISCARD;
		bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
	}

	submit_bio(bio);
}
935 
xfs_bio_wait_endio(struct bio * bio)936 static void xfs_bio_wait_endio(struct bio *bio)
937 {
938 	complete(bio->bi_private);
939 }
940 
941 int
xfs_zone_gc_reset_sync(struct xfs_rtgroup * rtg)942 xfs_zone_gc_reset_sync(
943 	struct xfs_rtgroup	*rtg)
944 {
945 	DECLARE_COMPLETION_ONSTACK(done);
946 	struct bio		bio;
947 	int			error;
948 
949 	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
950 			REQ_OP_ZONE_RESET | REQ_SYNC);
951 	bio.bi_private = &done;
952 	bio.bi_end_io = xfs_bio_wait_endio;
953 	xfs_submit_zone_reset_bio(rtg, &bio);
954 	wait_for_completion_io(&done);
955 
956 	error = blk_status_to_errno(bio.bi_status);
957 	bio_uninit(&bio);
958 	return error;
959 }
960 
/*
 * Kick off asynchronous resets for all zones on @reset_list.
 *
 * The RT device cache is flushed and the rmap inode log is forced first so
 * that nothing still references the old zone contents once the reset hits
 * the media.
 */
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		/* Commit all transactions referencing this zone's rmap. */
		xfs_log_force_inode(rtg_rmap(rtg));

		/* Unlink the group from the singly linked reset list. */
		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		/* Track the reset like any other GC I/O on the resetting list. */
		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);
		xfs_submit_zone_reset_bio(rtg, bio);
	} while (next);
}
995 
996 static bool
xfs_zone_gc_should_start_new_work(struct xfs_zone_gc_data * data)997 xfs_zone_gc_should_start_new_work(
998 	struct xfs_zone_gc_data	*data)
999 {
1000 	struct xfs_open_zone	*oz;
1001 
1002 	if (xfs_is_shutdown(data->mp))
1003 		return false;
1004 	if (!data->scratch_available)
1005 		return false;
1006 
1007 	oz = xfs_zone_gc_ensure_target(data->mp);
1008 	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
1009 		return false;
1010 
1011 	if (!data->iter.victim_rtg) {
1012 		if (kthread_should_stop() || kthread_should_park())
1013 			return false;
1014 		if (!xfs_zoned_need_gc(data->mp))
1015 			return false;
1016 		if (!xfs_zone_gc_select_victim(data))
1017 			return false;
1018 	}
1019 
1020 	return true;
1021 }
1022 
/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	/* Atomically take over the list of zones queued for reset. */
	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	/*
	 * Each set_current_state(TASK_RUNNING) below records that actual
	 * work was done, which keeps the caller (xfs_zoned_gcd) from going
	 * to sleep before rechecking for more work.
	 */
	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	/*
	 * Process completed chunks strictly in list order, stopping at the
	 * first chunk whose I/O is still in flight.
	 */
	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	/* Completed reads are turned into writes, batched under a plug. */
	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	/* Queue up new GC read chunks until we run out of work or space. */
	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}
1080 
/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before.  Because of that reflinks
 * are currently not supported on zoned file systems and can't be created or
 * mounted.
 */
/*
 * Main loop of the zone GC thread.
 *
 * The thread sets itself to TASK_INTERRUPTIBLE before handling work; if any
 * work was done (or a wakeup raced in) the state is back to TASK_RUNNING and
 * the loop immediately retries instead of sleeping.  Stop and park requests
 * are only honored once all in-flight GC I/O has completed.  On exit the
 * thread drops its victim reference and frees its private data.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	/* GC runs in NOFS context to avoid recursing into the filesystem. */
	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);

		xfs_zone_gc_handle_work(data);

		/*
		 * Only sleep if nothing set the state to running.  Else check for
		 * work again as someone might have queued up more work and woken
		 * us in the meantime.
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;
		}

		/*
		 * Only quiesce (and thus allow stopping or parking) when no
		 * GC I/O is pending on any of the lists.
		 */
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			/* Let waiters blocked on GC reservations retry. */
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	/* Drop the reference on a victim zone GC was still working on. */
	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}
1145 
1146 void
xfs_zone_gc_start(struct xfs_mount * mp)1147 xfs_zone_gc_start(
1148 	struct xfs_mount	*mp)
1149 {
1150 	if (xfs_has_zoned(mp))
1151 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1152 }
1153 
1154 void
xfs_zone_gc_stop(struct xfs_mount * mp)1155 xfs_zone_gc_stop(
1156 	struct xfs_mount	*mp)
1157 {
1158 	if (xfs_has_zoned(mp))
1159 		kthread_park(mp->m_zone_info->zi_gc_thread);
1160 }
1161 
1162 int
xfs_zone_gc_mount(struct xfs_mount * mp)1163 xfs_zone_gc_mount(
1164 	struct xfs_mount	*mp)
1165 {
1166 	struct xfs_zone_info	*zi = mp->m_zone_info;
1167 	struct xfs_zone_gc_data	*data;
1168 	struct xfs_open_zone	*oz;
1169 	int			error;
1170 
1171 	/*
1172 	 * If there are no free zones available for GC, pick the open zone with
1173 	 * the least used space to GC into.  This should only happen after an
1174 	 * unclean shutdown near ENOSPC while GC was ongoing.
1175 	 *
1176 	 * We also need to do this for the first gc zone allocation if we
1177 	 * unmounted while at the open limit.
1178 	 */
1179 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1180 	    zi->zi_nr_open_zones == mp->m_max_open_zones)
1181 		oz = xfs_zone_gc_steal_open(zi);
1182 	else
1183 		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1184 	if (!oz) {
1185 		xfs_warn(mp, "unable to allocate a zone for gc");
1186 		error = -EIO;
1187 		goto out;
1188 	}
1189 
1190 	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1191 	zi->zi_open_gc_zone = oz;
1192 
1193 	data = xfs_zone_gc_data_alloc(mp);
1194 	if (!data) {
1195 		error = -ENOMEM;
1196 		goto out_put_gc_zone;
1197 	}
1198 
1199 	zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1200 			"xfs-zone-gc/%s", mp->m_super->s_id);
1201 	if (IS_ERR(zi->zi_gc_thread)) {
1202 		xfs_warn(mp, "unable to create zone gc thread");
1203 		error = PTR_ERR(zi->zi_gc_thread);
1204 		goto out_free_gc_data;
1205 	}
1206 
1207 	/* xfs_zone_gc_start will unpark for rw mounts */
1208 	kthread_park(zi->zi_gc_thread);
1209 	return 0;
1210 
1211 out_free_gc_data:
1212 	kfree(data);
1213 out_put_gc_zone:
1214 	xfs_open_zone_put(zi->zi_open_gc_zone);
1215 out:
1216 	return error;
1217 }
1218 
1219 void
xfs_zone_gc_unmount(struct xfs_mount * mp)1220 xfs_zone_gc_unmount(
1221 	struct xfs_mount	*mp)
1222 {
1223 	struct xfs_zone_info	*zi = mp->m_zone_info;
1224 
1225 	kthread_stop(zi->zi_gc_thread);
1226 	if (zi->zi_open_gc_zone)
1227 		xfs_open_zone_put(zi->zi_open_gc_zone);
1228 }
1229