xref: /linux/fs/xfs/xfs_zone_gc.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023-2025 Christoph Hellwig.
4  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5  */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_errortag.h"
20 #include "xfs_error.h"
21 #include "xfs_zone_alloc.h"
22 #include "xfs_zone_priv.h"
23 #include "xfs_zones.h"
24 #include "xfs_trace.h"
25 
26 /*
 * Implement Garbage Collection (GC) of partially used zones.
28  *
29  * To support the purely sequential writes in each zone, zoned XFS needs to be
30  * able to move data remaining in a zone out of it to reset the zone to prepare
31  * for writing to it again.
32  *
33  * This is done by the GC thread implemented in this file.  To support that a
34  * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
35  * write the garbage collected data into.
36  *
37  * Whenever the available space is below the chosen threshold, the GC thread
38  * looks for potential non-empty but not fully used zones that are worth
39  * reclaiming.  Once found the rmap for the victim zone is queried, and after
40  * a bit of sorting to reduce fragmentation, the still live extents are read
41  * into memory and written to the GC target zone, and the bmap btree of the
42  * files is updated to point to the new location.  To avoid taking the IOLOCK
43  * and MMAPLOCK for the entire GC process and thus affecting the latency of
44  * user reads and writes to the files, the GC writes are speculative and the
45  * I/O completion checks that no other writes happened for the affected regions
46  * before remapping.
47  *
48  * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
50  * carefully ensures that the RT device cache is flushed and all transactions
51  * referencing the rmap have been committed to disk.
52  */
53 
54 /*
55  * Size of each GC scratch allocation, and the number of buffers.
56  */
57 #define XFS_GC_BUF_SIZE		SZ_1M
58 #define XFS_GC_NR_BUFS		2
59 static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
60 
61 /*
62  * Chunk that is read and written for each GC operation.
63  *
64  * Note that for writes to actual zoned devices, the chunk can be split when
65  * reaching the hardware limit.
66  */
struct xfs_gc_bio {
	struct xfs_zone_gc_data		*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head		entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode		*ip;
	loff_t				offset;
	unsigned int			len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t			old_startblock;
	xfs_daddr_t			new_daddr;
	struct xfs_zone_scratch		*scratch;

	/* Are we writing to a sequential write required zone? */
	bool				is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone		*oz;

	/*
	 * Zone being garbage collected.  Each chunk holds an active group
	 * reference and a rtg_gccount reference on it until it is freed.
	 */
	struct xfs_rtgroup		*victim_rtg;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio			bio;	/* must be last */
};
112 
113 #define XFS_ZONE_GC_RECS		1024
114 
/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	/* current victim zone, NULL once it has been fully processed */
	struct xfs_rtgroup		*victim_rtg;
	/* number of valid records in @recs from the last rmap query */
	unsigned int			rec_count;
	/* next record in @recs to hand out for garbage collection */
	unsigned int			rec_idx;
	/* rtgroup block to restart the rmap query from for the next batch */
	xfs_agblock_t			next_startblock;
	/* array of up to XFS_ZONE_GC_RECS rmap records to evacuate */
	struct xfs_rmap_irec		*recs;
};
123 
124 /*
125  * Per-mount GC state.
126  */
struct xfs_zone_gc_data {
	struct xfs_mount		*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set			bio_set;

	/*
	 * Scratchpad to buffer GC data, organized as a ring buffer over
	 * discontiguous folios.  scratch_head is where the buffer is filled,
	 * scratch_tail tracks the buffer space freed, and scratch_available
	 * counts the space available in the ring buffer between the head and
	 * the tail.  All three are byte values.
	 */
	struct folio			*scratch_folios[XFS_GC_NR_BUFS];
	unsigned int			scratch_size;
	unsigned int			scratch_available;
	unsigned int			scratch_head;
	unsigned int			scratch_tail;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head		reading;
	struct list_head		writing;
	struct list_head		resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter		iter;
};
160 
161 /*
162  * We aim to keep enough zones free in stock to fully use the open zone limit
163  * for data placement purposes. Additionally, the m_zonegc_low_space tunable
164  * can be set to make sure a fraction of the unused blocks are available for
165  * writing.
166  */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	/* nothing to do if no zone has reclaimable blocks */
	if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	/*
	 * GC if the writable space is smaller than what is needed to fully
	 * utilize the non-GC open zones.
	 */
	if (available <
	    xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

	/*
	 * Compute free * m_zonegc_low_space / 100 without overflowing the
	 * multiplication by first splitting free into the quotient and
	 * remainder of the division by 100.
	 */
	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		    remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}
194 
/*
 * Allocate and initialize the per-mount GC state: the rmap record array,
 * the bioset and the scratch folios.  Returns NULL on allocation failure.
 */
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * struct xfs_gc_bio embeds the bio at its end, so tell the bioset
	 * where to find the bio inside the containing structure.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_GC_NR_BUFS; i++) {
		data->scratch_folios[i] =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
		if (!data->scratch_folios[i])
			goto out_free_scratch;
	}
	data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
	data->scratch_available = data->scratch_size;
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	/* only release the folios that were successfully allocated */
	while (--i >= 0)
		folio_put(data->scratch_folios[i]);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}
237 
238 static void
239 xfs_zone_gc_data_free(
240 	struct xfs_zone_gc_data	*data)
241 {
242 	int			i;
243 
244 	for (i = 0; i < XFS_GC_NR_BUFS; i++)
245 		folio_put(data->scratch_folios[i]);
246 	bioset_exit(&data->bio_set);
247 	kfree(data->iter.recs);
248 	kfree(data);
249 }
250 
251 static void
252 xfs_zone_gc_iter_init(
253 	struct xfs_zone_gc_iter	*iter,
254 	struct xfs_rtgroup	*victim_rtg)
255 
256 {
257 	iter->next_startblock = 0;
258 	iter->rec_count = 0;
259 	iter->rec_idx = 0;
260 	iter->victim_rtg = victim_rtg;
261 	atomic_inc(&victim_rtg->rtg_gccount);
262 }
263 
/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	/* zones are expected to only hold regular file data forks */
	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		/*
		 * The batch is full.  Remember where to restart the query and
		 * return 1 to stop it early.
		 */
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}
287 
288 static int
289 xfs_zone_gc_rmap_rec_cmp(
290 	const void			*a,
291 	const void			*b)
292 {
293 	const struct xfs_rmap_irec	*reca = a;
294 	const struct xfs_rmap_irec	*recb = b;
295 	int				diff;
296 
297 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
298 	if (diff)
299 		return diff;
300 	return cmp_int(reca->rm_offset, recb->rm_offset);
301 }
302 
/*
 * Fill the iterator with the next batch of rmap records from the victim
 * zone, sorted for contiguous per-file writeback.  Drops the iterator's
 * references and clears victim_rtg once the zone is fully processed.
 */
static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	/* query everything from next_startblock to the end of the zone */
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	/*
	 * A positive return from the query means xfs_zone_gc_query_cb()
	 * stopped it early because the record batch filled up; zero means
	 * the whole remaining range of the zone was walked.
	 */
	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	/* no more live data in the zone, drop the iterator's references */
	atomic_dec(&iter->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}
365 
/*
 * Return the next rmap record to garbage collect in @chunk_rec and grab a
 * reference to its owning inode in @ipp.  Returns false once the victim
 * zone is fully processed, or after an error forced a shutdown.
 */
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	/* refill the record batch once the current one is used up */
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	/*
	 * The record may be stale by now, so use an untrusted lookup that
	 * revalidates the inode number.
	 */
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	/* skip inodes that can't hold zoned data (number may have been reused) */
	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}
415 
416 static void
417 xfs_zone_gc_iter_advance(
418 	struct xfs_zone_gc_iter	*iter,
419 	xfs_extlen_t		count_fsb)
420 {
421 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
422 
423 	irec->rm_offset += count_fsb;
424 	irec->rm_startblock += count_fsb;
425 	irec->rm_blockcount -= count_fsb;
426 	if (!irec->rm_blockcount)
427 		iter->rec_idx++;
428 }
429 
/*
 * Find the zone with the least used blocks in the given used bucket.
 *
 * Called with zi_used_buckets_lock held by xfs_zone_gc_select_victim().
 * Returns the victim with a group reference held, or NULL.
 */
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/*
		 * If the zone is already undergoing GC, don't pick it again.
		 *
		 * This prevents us from picking one of the zones for which we
		 * already submitted GC I/O, but for which the remapping hasn't
		 * concluded yet.  This won't cause data corruption, but
		 * increases write amplification and slows down GC, so this is
		 * a bad thing.
		 */
		if (atomic_read(&rtg->rtg_gccount)) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* found a better candidate, swap the held references */
		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}
489 
490 /*
491  * Iterate through all zones marked as reclaimable and find a candidate to
492  * reclaim.
493  */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	spin_lock(&zi->zi_used_buckets_lock);
	/* scan the used buckets in order and take the first victim found */
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	/* set up the iterator to walk the victim's live extents */
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}
519 
520 static struct xfs_open_zone *
521 xfs_zone_gc_steal_open(
522 	struct xfs_zone_info	*zi)
523 {
524 	struct xfs_open_zone	*oz, *found = NULL;
525 
526 	spin_lock(&zi->zi_open_zones_lock);
527 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
528 		if (!found || oz->oz_allocated < found->oz_allocated)
529 			found = oz;
530 	}
531 
532 	if (found) {
533 		found->oz_is_gc = true;
534 		list_del_init(&found->oz_entry);
535 		zi->zi_nr_open_zones--;
536 	}
537 
538 	spin_unlock(&zi->zi_open_zones_lock);
539 	return found;
540 }
541 
/*
 * Open a new zone for GC to write into and install it as zi_open_gc_zone.
 * Returns NULL while writes to the previous GC zone are still pending.
 */
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	/* GC has its own open zone reservation on top of the user limit */
	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}
565 
566 /*
567  * Ensure we have a valid open zone to write the GC data to.
568  *
569  * If the current target zone has space keep writing to it, else first wait for
570  * all pending writes and then pick a new one.
571  */
572 static struct xfs_open_zone *
573 xfs_zone_gc_ensure_target(
574 	struct xfs_mount	*mp)
575 {
576 	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;
577 
578 	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
579 		return xfs_zone_gc_select_target(mp);
580 	return oz;
581 }
582 
/*
 * I/O completion handler for GC reads, writes and zone resets.  Marks the
 * chunk as done and kicks the GC thread to process completions in order.
 */
static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}
594 
/*
 * Allocate up to *count_fsb blocks in the GC target zone.  On success
 * returns the open zone with an extra reference held and updates
 * *count_fsb, *daddr and *is_seq; returns NULL when nothing could be
 * allocated.
 */
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	/* don't stage more data than fits into the scratch ring buffer */
	*count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	/*
	 * For conventional zones the write location is known up front, while
	 * for sequential write required zones the actual location is picked
	 * by the zone append machinery at I/O completion time, so leave
	 * *daddr pointing to the start of the zone in that case.
	 */
	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}
639 
/*
 * Attach the scratch buffer space backing this chunk to its bio, walking
 * the scratch folio ring buffer from the current head and wrapping around
 * at the end.
 */
static void
xfs_zone_gc_add_data(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	unsigned int		len = chunk->len;
	unsigned int		off = data->scratch_head;

	do {
		/* offset into and remaining space in the current folio */
		unsigned int	this_off = off % XFS_GC_BUF_SIZE;
		unsigned int	this_len = min(len, XFS_GC_BUF_SIZE - this_off);

		bio_add_folio_nofail(&chunk->bio,
				data->scratch_folios[off / XFS_GC_BUF_SIZE],
				this_len, this_off);
		len -= this_len;
		off += this_len;
		/* wrap around at the end of the ring buffer */
		if (off == data->scratch_size)
			off = 0;
	} while (len);
}
661 
/*
 * Start garbage collecting a new chunk: grab the next rmap record,
 * allocate space in the GC target zone and submit the read into the
 * scratch buffer.  Returns false when no more work can be started now.
 */
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	unsigned int		len;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	/* one extra bio_vec as the range may straddle a scratch folio boundary */
	bio = bio_alloc_bioset(bdev,
			min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
			REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = len;
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->data = data;
	chunk->oz = oz;
	/* the chunk keeps the victim zone alive and marked as under GC */
	chunk->victim_rtg = iter->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	xfs_zone_gc_add_data(chunk);
	/* claim the scratch space until xfs_zone_gc_finish_chunk() frees it */
	data->scratch_head = (data->scratch_head + len) % data->scratch_size;
	data->scratch_available -= len;

	XFS_STATS_INC(mp, xs_gc_read_calls);

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}
724 
/*
 * Drop all references held by a GC chunk and free the bio backing it.
 */
static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	atomic_dec(&chunk->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(chunk->victim_rtg);
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}
736 
/*
 * Submit the write bio for a GC chunk.  Writes to sequential write
 * required zones are converted to zone append operations.
 */
static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}
750 
/*
 * Split off the front of a write bio that exceeds the device's zone append
 * limit into a chunk of its own, queued right before the original chunk.
 * Returns the new split chunk, or NULL when no (further) split is needed.
 */
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	/* only zone append writes are subject to the limit and get split */
	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	/* the split chunk holds its own inode reference */
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	/* ... as well as its own open zone and victim zone references */
	atomic_inc(&chunk->oz->oz_ref);

	split_chunk->victim_rtg = chunk->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	/* the remainder of the original chunk now starts after the split */
	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}
804 
/*
 * A GC read completed.  Turn the chunk around and write the staged data to
 * its new location, splitting the bio as required by the zone append limit.
 */
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_write_calls);
	XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	/* reuse the read bio for the write I/O */
	bio_reuse(&chunk->bio, REQ_OP_WRITE);
	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}
831 
/*
 * A GC write completed.  Release the scratch buffer space and remap the
 * written range in the owning file to its new location.
 */
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	/* the data left the scratch buffer, so the space can be reused */
	data->scratch_tail =
		(data->scratch_tail + chunk->len) % data->scratch_size;
	data->scratch_available += chunk->len;

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	/* for zone appends the device reported the actual write location */
	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}
878 
/*
 * A zone reset (or discard) completed.  Mark the zone as free again and
 * wake up everyone waiting for free zones.
 */
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}
902 
/*
 * Submit a zone reset for @rtg, falling back to a discard (or just a plain
 * bio completion) for conventional zones.
 */
static void
xfs_submit_zone_reset_bio(
	struct xfs_rtgroup	*rtg,
	struct bio		*bio)
{
	struct xfs_mount	*mp = rtg_mount(rtg);

	trace_xfs_zone_reset(rtg);

	/* a zone must not contain any live data when it is reset */
	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);

	/* error injection hook to exercise the reset error handling */
	if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
		bio_io_error(bio);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_zone_reset_calls);

	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (!bdev_max_discard_sectors(bio->bi_bdev)) {
			bio_endio(bio);
			return;
		}
		/* conventional zones get their whole capacity discarded */
		bio->bi_opf &= ~REQ_OP_ZONE_RESET;
		bio->bi_opf |= REQ_OP_DISCARD;
		bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
	}

	submit_bio(bio);
}
938 
/* Bio end_io handler that completes a waiting synchronous submitter. */
static void xfs_bio_wait_endio(struct bio *bio)
{
	complete(bio->bi_private);
}
943 
/*
 * Reset a single zone and wait for the reset to complete.
 *
 * Returns 0 on success or a negative errno derived from the bio status.
 */
int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio		bio;
	int			error;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET | REQ_SYNC);
	bio.bi_private = &done;
	bio.bi_end_io = xfs_bio_wait_endio;
	xfs_submit_zone_reset_bio(rtg, &bio);
	wait_for_completion_io(&done);

	error = blk_status_to_errno(bio.bi_status);
	bio_uninit(&bio);
	return error;
}
963 
/*
 * Submit resets for all zones on the passed in list.  The RT device cache
 * is flushed first and the rmap inode of each zone is forced to the log,
 * so that all transactions referencing the rmap are stable before the
 * zone can be reused.
 */
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	/* make sure all data written to the zones is stable */
	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		/* commit all transactions referencing the rmap to the log */
		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);
		xfs_submit_zone_reset_bio(rtg, bio);
	} while (next);
}
998 
/*
 * Check if the GC thread should start reading more chunks from the current
 * (or a newly selected) victim zone.
 */
static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	if (xfs_is_shutdown(data->mp))
		return false;
	/* all scratch buffer space is tied up in outstanding chunks */
	if (!data->scratch_available)
		return false;

	/* we also need unallocated space in the GC target zone */
	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;

	if (!data->iter.victim_rtg) {
		/* don't pick a new victim when about to stop or park */
		if (kthread_should_stop() || kthread_should_park())
			return false;
		if (!xfs_zoned_need_gc(data->mp))
			return false;
		if (!xfs_zone_gc_select_victim(data))
			return false;
	}

	return true;
}
1025 
1026 /*
1027  * Handle the work to read and write data for GC and to reset the zones,
1028  * including handling all completions.
1029  *
1030  * Note that the order of the chunks is preserved so that we don't undo the
1031  * optimal order established by xfs_zone_gc_query().
1032  */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	/* atomically take over the current list of zones to reset */
	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	/* finish completed zone resets, in submission order */
	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	/* remap chunks whose GC write completed */
	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	/* submit writes for chunks whose read completed */
	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	/* and start reading new chunks if there is room and need */
	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}
1083 
1084 /*
1085  * Note that the current GC algorithm would break reflinks and thus duplicate
1086  * data that was shared by multiple owners before.  Because of that reflinks
1087  * are currently not supported on zoned file systems and can't be created or
1088  * mounted.
1089  */
1090 static int
1091 xfs_zoned_gcd(
1092 	void			*private)
1093 {
1094 	struct xfs_zone_gc_data	*data = private;
1095 	struct xfs_mount	*mp = data->mp;
1096 	struct xfs_zone_info	*zi = mp->m_zone_info;
1097 	unsigned int		nofs_flag;
1098 
1099 	nofs_flag = memalloc_nofs_save();
1100 	set_freezable();
1101 
1102 	for (;;) {
1103 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1104 		xfs_set_zonegc_running(mp);
1105 
1106 		xfs_zone_gc_handle_work(data);
1107 
1108 		/*
1109 		 * Only sleep if nothing set the state to running.  Else check for
1110 		 * work again as someone might have queued up more work and woken
1111 		 * us in the meantime.
1112 		 */
1113 		if (get_current_state() == TASK_RUNNING) {
1114 			try_to_freeze();
1115 			continue;
1116 		}
1117 
1118 		if (list_empty(&data->reading) &&
1119 		    list_empty(&data->writing) &&
1120 		    list_empty(&data->resetting) &&
1121 		    !zi->zi_reset_list) {
1122 			xfs_clear_zonegc_running(mp);
1123 			xfs_zoned_resv_wake_all(mp);
1124 
1125 			if (kthread_should_stop()) {
1126 				__set_current_state(TASK_RUNNING);
1127 				break;
1128 			}
1129 
1130 			if (kthread_should_park()) {
1131 				__set_current_state(TASK_RUNNING);
1132 				kthread_parkme();
1133 				continue;
1134 			}
1135 		}
1136 
1137 		schedule();
1138 	}
1139 	xfs_clear_zonegc_running(mp);
1140 
1141 	if (data->iter.victim_rtg)
1142 		xfs_rtgroup_rele(data->iter.victim_rtg);
1143 
1144 	memalloc_nofs_restore(nofs_flag);
1145 	xfs_zone_gc_data_free(data);
1146 	return 0;
1147 }
1148 
1149 void
1150 xfs_zone_gc_start(
1151 	struct xfs_mount	*mp)
1152 {
1153 	if (xfs_has_zoned(mp))
1154 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1155 }
1156 
1157 void
1158 xfs_zone_gc_stop(
1159 	struct xfs_mount	*mp)
1160 {
1161 	if (xfs_has_zoned(mp))
1162 		kthread_park(mp->m_zone_info->zi_gc_thread);
1163 }
1164 
/*
 * Set up GC at mount time: pick or steal an open zone as the initial GC
 * target, allocate the per-thread GC state and create the GC kthread.
 * The thread is created parked; xfs_zone_gc_start() unparks it for
 * read-write mounts.
 *
 * Returns 0 on success or a negative errno.
 */
int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into.  This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(zi->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(zi->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(zi->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	/*
	 * NOTE(review): zi->zi_open_gc_zone is not cleared here, so it keeps
	 * pointing at the zone whose reference was just dropped.  Presumably
	 * harmless because a failed mount tears down m_zone_info before
	 * anything else can look at it - confirm against the mount error path.
	 */
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}
1221 
1222 void
1223 xfs_zone_gc_unmount(
1224 	struct xfs_mount	*mp)
1225 {
1226 	struct xfs_zone_info	*zi = mp->m_zone_info;
1227 
1228 	kthread_stop(zi->zi_gc_thread);
1229 	if (zi->zi_open_gc_zone)
1230 		xfs_open_zone_put(zi->zi_open_gc_zone);
1231 }
1232