xref: /linux/fs/xfs/xfs_zone_gc.c (revision c148bc7535650fbfa95a1f571b9ffa2ab478ea33)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023-2025 Christoph Hellwig.
4  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5  */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_zone_alloc.h"
20 #include "xfs_zone_priv.h"
21 #include "xfs_zones.h"
22 #include "xfs_trace.h"
23 
24 /*
25  * Implement Garbage Collection (GC) of partially used zones.
26  *
27  * To support the purely sequential writes in each zone, zoned XFS needs to be
28  * able to move data remaining in a zone out of it to reset the zone to prepare
29  * for writing to it again.
30  *
31  * This is done by the GC thread implemented in this file.  To support that,
32  * a number of zones (XFS_GC_ZONES) is reserved from the user-visible capacity
33  * to write the garbage collected data into.
34  *
35  * Whenever the available space is below the chosen threshold, the GC thread
36  * looks for candidate zones that are non-empty but not fully used and worth
37  * reclaiming.  Once found, the rmap for the victim zone is queried, and after
38  * a bit of sorting to reduce fragmentation, the still live extents are read
39  * into memory and written to the GC target zone, and the bmap btree of the
40  * files is updated to point to the new location.  To avoid taking the IOLOCK
41  * and MMAPLOCK for the entire GC process and thus affecting the latency of
42  * user reads and writes to the files, the GC writes are speculative and the
43  * I/O completion checks that no other writes happened for the affected regions
44  * before remapping.
45  *
46  * Once a zone does not contain any valid data, be that through GC or user
47  * block removal, it is queued for a zone reset.  The reset operation
48  * carefully ensures that the RT device cache is flushed and all transactions
49  * referencing the rmap have been committed to disk.
50  */
51 
52 /*
53  * Size of each GC scratch pad.  This is also the upper bound for each
54  * GC I/O, which helps to keep latency down.
55  */
56 #define XFS_GC_CHUNK_SIZE	SZ_1M
57 
58 /*
59  * Scratchpad data to read GCed data into.
60  *
61  * The offset member tracks where the next allocation starts, and freed tracks
62  * the amount of space that is not used anymore.
63  */
64 #define XFS_ZONE_GC_NR_SCRATCH	2
65 struct xfs_zone_scratch {
66 	struct folio			*folio;
67 	unsigned int			offset;
68 	unsigned int			freed;
69 };
70 
71 /*
72  * Chunk that is read and written for each GC operation.
73  *
74  * Note that for writes to actual zoned devices, the chunk can be split when
75  * reaching the hardware limit.
76  */
77 struct xfs_gc_bio {
78 	struct xfs_zone_gc_data		*data;
79 
80 	/*
81 	 * Entry into the reading/writing/resetting list.  Only accessed from
82 	 * the GC thread, so no locking needed.
83 	 */
84 	struct list_head		entry;
85 
86 	/*
87 	 * State of this gc_bio.  Done means the current I/O completed.
88 	 * Set from the bio end I/O handler, read from the GC thread.
89 	 */
90 	enum {
91 		XFS_GC_BIO_NEW,
92 		XFS_GC_BIO_DONE,
93 	} state;
94 
95 	/*
96 	 * Pointer to the inode and byte range in the inode that this
97 	 * GC chunk is operating on.
98 	 */
99 	struct xfs_inode		*ip;
100 	loff_t				offset;
101 	unsigned int			len;
102 
103 	/*
104 	 * Existing startblock (in the zone to be freed) and newly assigned
105 	 * daddr in the zone GCed into.
106 	 */
107 	xfs_fsblock_t			old_startblock;
108 	xfs_daddr_t			new_daddr;
109 	struct xfs_zone_scratch		*scratch;
110 
111 	/* Are we writing to a sequential write required zone? */
112 	bool				is_seq;
113 
114 	/* Open Zone being written to */
115 	struct xfs_open_zone		*oz;
116 
117 	/* Bio used for reads and writes, including the bvec used by it */
118 	struct bio_vec			bv;
119 	struct bio			bio;	/* must be last */
120 };
121 
122 #define XFS_ZONE_GC_RECS		1024
123 
124 /* iterator, needs to be reinitialized for each victim zone */
125 struct xfs_zone_gc_iter {
126 	struct xfs_rtgroup		*victim_rtg;
127 	unsigned int			rec_count;
128 	unsigned int			rec_idx;
129 	xfs_agblock_t			next_startblock;
130 	struct xfs_rmap_irec		*recs;
131 };
132 
133 /*
134  * Per-mount GC state.
135  */
136 struct xfs_zone_gc_data {
137 	struct xfs_mount		*mp;
138 
139 	/* bioset used to allocate the gc_bios */
140 	struct bio_set			bio_set;
141 
142 	/*
143 	 * Scratchpads used, and the index indicating which one is currently in use.
144 	 */
145 	struct xfs_zone_scratch		scratch[XFS_ZONE_GC_NR_SCRATCH];
146 	unsigned int			scratch_idx;
147 
148 	/*
149 	 * List of bios currently being read, written and reset.
150 	 * These lists are only accessed by the GC thread itself, and must only
151 	 * be processed in order.
152 	 */
153 	struct list_head		reading;
154 	struct list_head		writing;
155 	struct list_head		resetting;
156 
157 	/*
158 	 * Iterator for the victim zone.
159 	 */
160 	struct xfs_zone_gc_iter		iter;
161 };
162 
163 /*
164  * We aim to keep enough zones free in stock to fully use the open zone limit
165  * for data placement purposes.
166  */
167 bool
168 xfs_zoned_need_gc(
169 	struct xfs_mount	*mp)
170 {
171 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
172 		return false;
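	/*
	 * GC kicks in once the estimated available space drops below one
	 * full zone worth of blocks for each user visible open zone; the
	 * open zones reserved for GC are excluded from that target.
	 */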
173 	if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
174 	    mp->m_groups[XG_TYPE_RTG].blocks *
175 	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
176 		return true;
177 	return false;
178 }
179 
180 static struct xfs_zone_gc_data *
181 xfs_zone_gc_data_alloc(
182 	struct xfs_mount	*mp)
183 {
184 	struct xfs_zone_gc_data	*data;
185 	int			i;
186 
187 	data = kzalloc(sizeof(*data), GFP_KERNEL);
188 	if (!data)
189 		return NULL;
190 	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
191 			GFP_KERNEL);
192 	if (!data->iter.recs)
193 		goto out_free_data;
194 
195 	/*
196 	 * We actually only need a single bio_vec.  It would be nice to have
197 	 * a flag that only allocates the inline bvecs and not the separate
198 	 * bvec pool.
199 	 */
200 	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
201 			BIOSET_NEED_BVECS))
202 		goto out_free_recs;
203 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
204 		data->scratch[i].folio =
205 			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
206 		if (!data->scratch[i].folio)
207 			goto out_free_scratch;
208 	}
209 	INIT_LIST_HEAD(&data->reading);
210 	INIT_LIST_HEAD(&data->writing);
211 	INIT_LIST_HEAD(&data->resetting);
212 	data->mp = mp;
213 	return data;
214 
215 out_free_scratch:
216 	while (--i >= 0)
217 		folio_put(data->scratch[i].folio);
218 	bioset_exit(&data->bio_set);
219 out_free_recs:
220 	kfree(data->iter.recs);
221 out_free_data:
222 	kfree(data);
223 	return NULL;
224 }
225 
226 static void
227 xfs_zone_gc_data_free(
228 	struct xfs_zone_gc_data	*data)
229 {
230 	int			i;
231 
232 	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
233 		folio_put(data->scratch[i].folio);
234 	bioset_exit(&data->bio_set);
235 	kfree(data->iter.recs);
236 	kfree(data);
237 }
238 
239 static void
240 xfs_zone_gc_iter_init(
241 	struct xfs_zone_gc_iter	*iter,
242 	struct xfs_rtgroup	*victim_rtg)
243 
244 {
245 	iter->next_startblock = 0;
246 	iter->rec_count = 0;
247 	iter->rec_idx = 0;
248 	iter->victim_rtg = victim_rtg;
249 }
250 
251 /*
252  * Query the rmap of the victim zone to gather the records to evacuate.
253  */
254 static int
255 xfs_zone_gc_query_cb(
256 	struct xfs_btree_cur	*cur,
257 	const struct xfs_rmap_irec *irec,
258 	void			*private)
259 {
260 	struct xfs_zone_gc_iter	*iter = private;
261 
262 	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
263 	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
264 	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
265 
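	/*
	 * Stash the record.  Returning 1 once the array is full stops the
	 * rmap query early, and next_startblock remembers where to resume
	 * on the next call to xfs_zone_gc_query().
	 */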
266 	iter->recs[iter->rec_count] = *irec;
267 	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
268 		iter->next_startblock =
269 			irec->rm_startblock + irec->rm_blockcount;
270 		return 1;
271 	}
272 	return 0;
273 }
274 
275 #define cmp_int(l, r)		((l > r) - (l < r))
276 
277 static int
278 xfs_zone_gc_rmap_rec_cmp(
279 	const void			*a,
280 	const void			*b)
281 {
282 	const struct xfs_rmap_irec	*reca = a;
283 	const struct xfs_rmap_irec	*recb = b;
284 	int				diff;
285 
286 	diff = cmp_int(reca->rm_owner, recb->rm_owner);
287 	if (diff)
288 		return diff;
289 	return cmp_int(reca->rm_offset, recb->rm_offset);
290 }
291 
292 static int
293 xfs_zone_gc_query(
294 	struct xfs_mount	*mp,
295 	struct xfs_zone_gc_iter	*iter)
296 {
297 	struct xfs_rtgroup	*rtg = iter->victim_rtg;
298 	struct xfs_rmap_irec	ri_low = { };
299 	struct xfs_rmap_irec	ri_high;
300 	struct xfs_btree_cur	*cur;
301 	struct xfs_trans	*tp;
302 	int			error;
303 
304 	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
305 	if (iter->next_startblock == rtg_blocks(rtg))
306 		goto done;
307 
308 	ASSERT(iter->next_startblock < rtg_blocks(rtg));
309 	ri_low.rm_startblock = iter->next_startblock;
310 	memset(&ri_high, 0xFF, sizeof(ri_high));
311 
312 	iter->rec_idx = 0;
313 	iter->rec_count = 0;
314 
315 	error = xfs_trans_alloc_empty(mp, &tp);
316 	if (error)
317 		return error;
318 
319 	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
320 	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
321 	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
322 			xfs_zone_gc_query_cb, iter);
323 	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
324 	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
325 	xfs_trans_cancel(tp);
326 
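	/*
	 * A positive return value means the query callback stopped the
	 * iteration early because the record array filled up; only
	 * negative values are actual errors.
	 */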
327 	if (error < 0)
328 		return error;
329 
330 	/*
331 	 * Sort the rmap records by inode number and increasing offset to
332 	 * defragment the mappings.
333 	 *
334 	 * This could be further enhanced by an even bigger look ahead window,
335 	 * but that's better left until we have better detection of changes to
336 	 * inode mapping to avoid the potential of GCing already dead data.
337 	 */
338 	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
339 			xfs_zone_gc_rmap_rec_cmp, NULL);
340 
341 	if (error == 0) {
342 		/*
343 		 * We finished iterating through the zone.
344 		 */
345 		iter->next_startblock = rtg_blocks(rtg);
346 		if (iter->rec_count == 0)
347 			goto done;
348 	}
349 
350 	return 0;
351 done:
352 	xfs_rtgroup_rele(iter->victim_rtg);
353 	iter->victim_rtg = NULL;
354 	return 0;
355 }
356 
357 static bool
358 xfs_zone_gc_iter_next(
359 	struct xfs_mount	*mp,
360 	struct xfs_zone_gc_iter	*iter,
361 	struct xfs_rmap_irec	*chunk_rec,
362 	struct xfs_inode	**ipp)
363 {
364 	struct xfs_rmap_irec	*irec;
365 	int			error;
366 
367 	if (!iter->victim_rtg)
368 		return false;
369 
370 retry:
371 	if (iter->rec_idx == iter->rec_count) {
372 		error = xfs_zone_gc_query(mp, iter);
373 		if (error)
374 			goto fail;
375 		if (!iter->victim_rtg)
376 			return false;
377 	}
378 
379 	irec = &iter->recs[iter->rec_idx];
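	/*
	 * The owner in the rmap record may be stale by now, so look the
	 * inode up untrusted and don't let it linger in the inode cache.
	 */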
380 	error = xfs_iget(mp, NULL, irec->rm_owner,
381 			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
382 	if (error) {
383 		/*
384 		 * If the inode was already deleted, skip over it.
385 		 */
386 		if (error == -ENOENT) {
387 			iter->rec_idx++;
388 			goto retry;
389 		}
390 		goto fail;
391 	}
392 
393 	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
394 		iter->rec_idx++;
395 		xfs_irele(*ipp);
396 		goto retry;
397 	}
398 
399 	*chunk_rec = *irec;
400 	return true;
401 
402 fail:
403 	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
404 	return false;
405 }
406 
407 static void
408 xfs_zone_gc_iter_advance(
409 	struct xfs_zone_gc_iter	*iter,
410 	xfs_extlen_t		count_fsb)
411 {
412 	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];
413 
414 	irec->rm_offset += count_fsb;
415 	irec->rm_startblock += count_fsb;
416 	irec->rm_blockcount -= count_fsb;
417 	if (!irec->rm_blockcount)
418 		iter->rec_idx++;
419 }
420 
421 static struct xfs_rtgroup *
422 xfs_zone_gc_pick_victim_from(
423 	struct xfs_mount	*mp,
424 	uint32_t		bucket)
425 {
426 	struct xfs_zone_info	*zi = mp->m_zone_info;
427 	uint32_t		victim_used = U32_MAX;
428 	struct xfs_rtgroup	*victim_rtg = NULL;
429 	uint32_t		bit;
430 
431 	if (!zi->zi_used_bucket_entries[bucket])
432 		return NULL;
433 
434 	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
435 			mp->m_sb.sb_rgcount) {
436 		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
437 
438 		if (!rtg)
439 			continue;
440 
441 		/* skip zones that are just waiting for a reset */
442 		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
443 		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
444 			xfs_rtgroup_rele(rtg);
445 			continue;
446 		}
447 
448 		if (victim_rtg)
449 			xfs_rtgroup_rele(victim_rtg);
450 		victim_rtg = rtg;
451 		victim_used = rtg_rmap(rtg)->i_used_blocks;
452 
453 		/*
454 		 * Any zone that is less than 1 percent used is fair game for
455 		 * instant reclaim. All of these zones are in the last
456 		 * bucket, so avoid the expensive division for the zones
457 		 * in the other buckets.
458 		 */
459 		if (bucket == 0 &&
460 		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
461 			break;
462 	}
463 
464 	return victim_rtg;
465 }
466 
467 /*
468  * Iterate through all zones marked as reclaimable and find a candidate to
469  * reclaim.
470  */
471 static bool
472 xfs_zone_gc_select_victim(
473 	struct xfs_zone_gc_data	*data)
474 {
475 	struct xfs_zone_gc_iter	*iter = &data->iter;
476 	struct xfs_mount	*mp = data->mp;
477 	struct xfs_zone_info	*zi = mp->m_zone_info;
478 	struct xfs_rtgroup	*victim_rtg = NULL;
479 	unsigned int		bucket;
480 
481 	if (xfs_is_shutdown(mp))
482 		return false;
483 
484 	if (iter->victim_rtg)
485 		return true;
486 
487 	/*
488 	 * Don't start new work if we are asked to stop or park.
489 	 */
490 	if (kthread_should_stop() || kthread_should_park())
491 		return false;
492 
493 	if (!xfs_zoned_need_gc(mp))
494 		return false;
495 
496 	spin_lock(&zi->zi_used_buckets_lock);
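	/*
	 * Walk the used-space buckets in order and stop at the first one
	 * that yields a victim; the scan inside a bucket prefers the zone
	 * with the fewest used blocks.
	 */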
497 	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
498 		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
499 		if (victim_rtg)
500 			break;
501 	}
502 	spin_unlock(&zi->zi_used_buckets_lock);
503 
504 	if (!victim_rtg)
505 		return false;
506 
507 	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
508 	xfs_zone_gc_iter_init(iter, victim_rtg);
509 	return true;
510 }
511 
512 static struct xfs_open_zone *
513 xfs_zone_gc_steal_open(
514 	struct xfs_zone_info	*zi)
515 {
516 	struct xfs_open_zone	*oz, *found = NULL;
517 
518 	spin_lock(&zi->zi_open_zones_lock);
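	/*
	 * Pick the open zone with the lowest write pointer, i.e. the most
	 * remaining unwritten space, and take it away from regular writers.
	 */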
519 	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
520 		if (!found ||
521 		    oz->oz_write_pointer < found->oz_write_pointer)
522 			found = oz;
523 	}
524 
525 	if (found) {
526 		found->oz_is_gc = true;
527 		list_del_init(&found->oz_entry);
528 		zi->zi_nr_open_zones--;
529 	}
530 
531 	spin_unlock(&zi->zi_open_zones_lock);
532 	return found;
533 }
534 
535 static struct xfs_open_zone *
536 xfs_zone_gc_select_target(
537 	struct xfs_mount	*mp)
538 {
539 	struct xfs_zone_info	*zi = mp->m_zone_info;
540 	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;
541 
542 	/*
543 	 * We need to wait for pending writes to finish.
544 	 */
545 	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
546 		return NULL;
547 
548 	ASSERT(zi->zi_nr_open_zones <=
549 		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
550 	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
551 	if (oz)
552 		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
553 	spin_lock(&zi->zi_open_zones_lock);
554 	zi->zi_open_gc_zone = oz;
555 	spin_unlock(&zi->zi_open_zones_lock);
556 	return oz;
557 }
558 
559 /*
560  * Ensure we have a valid open zone to write the GC data to.
561  *
562  * If the current target zone has space keep writing to it, else first wait for
563  * all pending writes and then pick a new one.
564  */
565 static struct xfs_open_zone *
566 xfs_zone_gc_ensure_target(
567 	struct xfs_mount	*mp)
568 {
569 	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;
570 
571 	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
572 		return xfs_zone_gc_select_target(mp);
573 	return oz;
574 }
575 
576 static unsigned int
577 xfs_zone_gc_scratch_available(
578 	struct xfs_zone_gc_data	*data)
579 {
580 	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
581 }
582 
583 static bool
584 xfs_zone_gc_space_available(
585 	struct xfs_zone_gc_data	*data)
586 {
587 	struct xfs_open_zone	*oz;
588 
589 	oz = xfs_zone_gc_ensure_target(data->mp);
590 	if (!oz)
591 		return false;
592 	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
593 		xfs_zone_gc_scratch_available(data);
594 }
595 
596 static void
597 xfs_zone_gc_end_io(
598 	struct bio		*bio)
599 {
600 	struct xfs_gc_bio	*chunk =
601 		container_of(bio, struct xfs_gc_bio, bio);
602 	struct xfs_zone_gc_data	*data = chunk->data;
603 
604 	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
605 	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
606 }
607 
608 static struct xfs_open_zone *
609 xfs_zone_gc_alloc_blocks(
610 	struct xfs_zone_gc_data	*data,
611 	xfs_extlen_t		*count_fsb,
612 	xfs_daddr_t		*daddr,
613 	bool			*is_seq)
614 {
615 	struct xfs_mount	*mp = data->mp;
616 	struct xfs_open_zone	*oz;
617 
618 	oz = xfs_zone_gc_ensure_target(mp);
619 	if (!oz)
620 		return NULL;
621 
622 	*count_fsb = min(*count_fsb,
623 		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));
624 
625 	/*
626 	 * Directly allocate GC blocks from the reserved pool.
627 	 *
628 	 * If we'd take them from the normal pool we could be stealing blocks
629 	 * from a regular writer, which would then have to wait for GC and
630 	 * deadlock.
631 	 */
632 	spin_lock(&mp->m_sb_lock);
633 	*count_fsb = min(*count_fsb,
634 			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
635 	*count_fsb = min3(*count_fsb,
636 			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
637 			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
638 	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
639 	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
640 	spin_unlock(&mp->m_sb_lock);
641 
642 	if (!*count_fsb)
643 		return NULL;
644 
645 	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
646 	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
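	/*
	 * For conventional zones the write location is known now, so add
	 * the write pointer offset to the returned daddr.  Sequential write
	 * required zones use zone append, where the device reports the
	 * actual location at I/O completion.
	 */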
647 	if (!*is_seq)
648 		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
649 	oz->oz_write_pointer += *count_fsb;
650 	atomic_inc(&oz->oz_ref);
651 	return oz;
652 }
653 
654 static bool
655 xfs_zone_gc_start_chunk(
656 	struct xfs_zone_gc_data	*data)
657 {
658 	struct xfs_zone_gc_iter	*iter = &data->iter;
659 	struct xfs_mount	*mp = data->mp;
660 	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
661 	struct xfs_open_zone	*oz;
662 	struct xfs_rmap_irec	irec;
663 	struct xfs_gc_bio	*chunk;
664 	struct xfs_inode	*ip;
665 	struct bio		*bio;
666 	xfs_daddr_t		daddr;
667 	bool			is_seq;
668 
669 	if (xfs_is_shutdown(mp))
670 		return false;
671 
672 	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
673 		return false;
674 	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
675 			&is_seq);
676 	if (!oz) {
677 		xfs_irele(ip);
678 		return false;
679 	}
680 
681 	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);
682 
683 	chunk = container_of(bio, struct xfs_gc_bio, bio);
684 	chunk->ip = ip;
685 	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
686 	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
687 	chunk->old_startblock =
688 		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
689 	chunk->new_daddr = daddr;
690 	chunk->is_seq = is_seq;
691 	chunk->scratch = &data->scratch[data->scratch_idx];
692 	chunk->data = data;
693 	chunk->oz = oz;
694 
695 	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
696 	bio->bi_end_io = xfs_zone_gc_end_io;
697 	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
698 			chunk->scratch->offset);
699 	chunk->scratch->offset += chunk->len;
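	/* This scratch folio is fully claimed, rotate to the other one. */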
700 	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
701 		data->scratch_idx =
702 			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
703 	}
704 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
705 	list_add_tail(&chunk->entry, &data->reading);
706 	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
707 
708 	submit_bio(bio);
709 	return true;
710 }
711 
712 static void
713 xfs_zone_gc_free_chunk(
714 	struct xfs_gc_bio	*chunk)
715 {
716 	list_del(&chunk->entry);
717 	xfs_open_zone_put(chunk->oz);
718 	xfs_irele(chunk->ip);
719 	bio_put(&chunk->bio);
720 }
721 
722 static void
723 xfs_zone_gc_submit_write(
724 	struct xfs_zone_gc_data	*data,
725 	struct xfs_gc_bio	*chunk)
726 {
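	/*
	 * Writes to sequential write required zones are issued as zone
	 * appends so that the device assigns the final location, while
	 * conventional zones are written in place at new_daddr.
	 */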
727 	if (chunk->is_seq) {
728 		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
729 		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
730 	}
731 	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
732 	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
733 	submit_bio(&chunk->bio);
734 }
735 
736 static struct xfs_gc_bio *
737 xfs_zone_gc_split_write(
738 	struct xfs_zone_gc_data	*data,
739 	struct xfs_gc_bio	*chunk)
740 {
741 	struct queue_limits	*lim =
742 		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
743 	struct xfs_gc_bio	*split_chunk;
744 	int			split_sectors;
745 	unsigned int		split_len;
746 	struct bio		*split;
747 	unsigned int		nsegs;
748 
749 	if (!chunk->is_seq)
750 		return NULL;
751 
752 	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
753 			lim->max_zone_append_sectors << SECTOR_SHIFT);
754 	if (!split_sectors)
755 		return NULL;
756 
757 	/* ensure the split chunk is still block size aligned */
758 	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
759 			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
760 	split_len = split_sectors << SECTOR_SHIFT;
761 
762 	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
763 	split_chunk = container_of(split, struct xfs_gc_bio, bio);
764 	split_chunk->data = data;
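	/*
	 * The split chunk completes and is torn down on its own, so it
	 * needs its own references on the inode and the open zone.
	 */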
765 	ihold(VFS_I(chunk->ip));
766 	split_chunk->ip = chunk->ip;
767 	split_chunk->is_seq = chunk->is_seq;
768 	split_chunk->scratch = chunk->scratch;
769 	split_chunk->offset = chunk->offset;
770 	split_chunk->len = split_len;
771 	split_chunk->old_startblock = chunk->old_startblock;
772 	split_chunk->new_daddr = chunk->new_daddr;
773 	split_chunk->oz = chunk->oz;
774 	atomic_inc(&chunk->oz->oz_ref);
775 
776 	chunk->offset += split_len;
777 	chunk->len -= split_len;
778 	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
779 
780 	/* add right before the original chunk */
781 	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
782 	list_add_tail(&split_chunk->entry, &chunk->entry);
783 	return split_chunk;
784 }
785 
786 static void
787 xfs_zone_gc_write_chunk(
788 	struct xfs_gc_bio	*chunk)
789 {
790 	struct xfs_zone_gc_data	*data = chunk->data;
791 	struct xfs_mount	*mp = chunk->ip->i_mount;
792 	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
793 	struct xfs_gc_bio	*split_chunk;
794 
795 	if (chunk->bio.bi_status)
796 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
797 	if (xfs_is_shutdown(mp)) {
798 		xfs_zone_gc_free_chunk(chunk);
799 		return;
800 	}
801 
802 	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
803 	list_move_tail(&chunk->entry, &data->writing);
804 
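	/*
	 * Reuse the bio that carried the read for the write: reset it and
	 * re-add the scratch folio range that the data was read into.
	 */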
805 	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
806 	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
807 			folio_offset);
808 
809 	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
810 		xfs_zone_gc_submit_write(data, split_chunk);
811 	xfs_zone_gc_submit_write(data, chunk);
812 }
813 
814 static void
815 xfs_zone_gc_finish_chunk(
816 	struct xfs_gc_bio	*chunk)
817 {
818 	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
819 	struct xfs_inode	*ip = chunk->ip;
820 	struct xfs_mount	*mp = ip->i_mount;
821 	int			error;
822 
823 	if (chunk->bio.bi_status)
824 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
825 	if (xfs_is_shutdown(mp)) {
826 		xfs_zone_gc_free_chunk(chunk);
827 		return;
828 	}
829 
830 	chunk->scratch->freed += chunk->len;
831 	if (chunk->scratch->freed == chunk->scratch->offset) {
832 		chunk->scratch->offset = 0;
833 		chunk->scratch->freed = 0;
834 	}
835 
836 	/*
837 	 * Cycle through the iolock and wait for direct I/O and layouts to
838 	 * ensure no one is reading from the old mapping before it goes away.
839 	 *
840 	 * Note that xfs_zoned_end_io() below checks that no other writer raced
841 	 * with us to update the mapping by checking that the old startblock
842 	 * didn't change.
843 	 */
844 	xfs_ilock(ip, iolock);
845 	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
846 	if (!error)
847 		inode_dio_wait(VFS_I(ip));
848 	xfs_iunlock(ip, iolock);
849 	if (error)
850 		goto free;
851 
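	/*
	 * For zone append writes the device reports the sector the data
	 * actually landed at in bi_sector, so pick up the final location
	 * from there before remapping.
	 */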
852 	if (chunk->is_seq)
853 		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
854 	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
855 			chunk->new_daddr, chunk->oz, chunk->old_startblock);
856 free:
857 	if (error)
858 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
859 	xfs_zone_gc_free_chunk(chunk);
860 }
861 
862 static void
863 xfs_zone_gc_finish_reset(
864 	struct xfs_gc_bio	*chunk)
865 {
866 	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
867 	struct xfs_mount	*mp = rtg_mount(rtg);
868 	struct xfs_zone_info	*zi = mp->m_zone_info;
869 
870 	if (chunk->bio.bi_status) {
871 		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
872 		goto out;
873 	}
874 
875 	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
876 	atomic_inc(&zi->zi_nr_free_zones);
877 
878 	xfs_zoned_add_available(mp, rtg_blocks(rtg));
879 
880 	wake_up_all(&zi->zi_zone_wait);
881 out:
882 	list_del(&chunk->entry);
883 	bio_put(&chunk->bio);
884 }
885 
886 static bool
887 xfs_zone_gc_prepare_reset(
888 	struct bio		*bio,
889 	struct xfs_rtgroup	*rtg)
890 {
891 	trace_xfs_zone_reset(rtg);
892 
893 	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
894 	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
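	/*
	 * Conventional zones have no write pointer to reset.  Discard the
	 * whole zone instead if the device supports it, else skip the I/O
	 * entirely.
	 */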
895 	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
896 		if (!bdev_max_discard_sectors(bio->bi_bdev))
897 			return false;
898 		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
899 		bio->bi_iter.bi_size =
900 			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
901 	}
902 
903 	return true;
904 }
905 
906 int
907 xfs_zone_gc_reset_sync(
908 	struct xfs_rtgroup	*rtg)
909 {
910 	int			error = 0;
911 	struct bio		bio;
912 
913 	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
914 			REQ_OP_ZONE_RESET);
915 	if (xfs_zone_gc_prepare_reset(&bio, rtg))
916 		error = submit_bio_wait(&bio);
917 	bio_uninit(&bio);
918 
919 	return error;
920 }
921 
922 static void
923 xfs_zone_gc_reset_zones(
924 	struct xfs_zone_gc_data	*data,
925 	struct xfs_group	*reset_list)
926 {
927 	struct xfs_group	*next = reset_list;
928 
929 	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
930 		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
931 		return;
932 	}
933 
934 	do {
935 		struct xfs_rtgroup	*rtg = to_rtg(next);
936 		struct xfs_gc_bio	*chunk;
937 		struct bio		*bio;
938 
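		/*
		 * Ensure all transactions referencing this zone's rmap are
		 * on stable storage before the zone is reset and its blocks
		 * can be reused.
		 */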
939 		xfs_log_force_inode(rtg_rmap(rtg));
940 
941 		next = rtg_group(rtg)->xg_next_reset;
942 		rtg_group(rtg)->xg_next_reset = NULL;
943 
944 		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
945 				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
946 		bio->bi_private = rtg;
947 		bio->bi_end_io = xfs_zone_gc_end_io;
948 
949 		chunk = container_of(bio, struct xfs_gc_bio, bio);
950 		chunk->data = data;
951 		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
952 		list_add_tail(&chunk->entry, &data->resetting);
953 
954 		/*
955 		 * Also use the bio to drive the state machine when neither
956 		 * zone reset nor discard is supported to keep things simple.
957 		 */
958 		if (xfs_zone_gc_prepare_reset(bio, rtg))
959 			submit_bio(bio);
960 		else
961 			bio_endio(bio);
962 	} while (next);
963 }
964 
965 /*
966  * Handle the work to read and write data for GC and to reset the zones,
967  * including handling all completions.
968  *
969  * Note that the order of the chunks is preserved so that we don't undo the
970  * optimal order established by xfs_zone_gc_query().
971  */
972 static bool
973 xfs_zone_gc_handle_work(
974 	struct xfs_zone_gc_data	*data)
975 {
976 	struct xfs_zone_info	*zi = data->mp->m_zone_info;
977 	struct xfs_gc_bio	*chunk, *next;
978 	struct xfs_group	*reset_list;
979 	struct blk_plug		plug;
980 
981 	spin_lock(&zi->zi_reset_list_lock);
982 	reset_list = zi->zi_reset_list;
983 	zi->zi_reset_list = NULL;
984 	spin_unlock(&zi->zi_reset_list_lock);
985 
986 	if (!xfs_zone_gc_select_victim(data) ||
987 	    !xfs_zone_gc_space_available(data)) {
988 		if (list_empty(&data->reading) &&
989 		    list_empty(&data->writing) &&
990 		    list_empty(&data->resetting) &&
991 		    !reset_list)
992 			return false;
993 	}
994 
995 	__set_current_state(TASK_RUNNING);
996 	try_to_freeze();
997 
998 	if (reset_list)
999 		xfs_zone_gc_reset_zones(data, reset_list);
1000 
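	/*
	 * Reap completions in order: finished resets first, then finished
	 * writes, then turn finished reads into writes, and finally start
	 * new read chunks.  Stopping at the first chunk that isn't done in
	 * each list preserves the ordering established by the rmap query.
	 */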
1001 	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1002 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1003 			break;
1004 		xfs_zone_gc_finish_reset(chunk);
1005 	}
1006 
1007 	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1008 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1009 			break;
1010 		xfs_zone_gc_finish_chunk(chunk);
1011 	}
1012 
1013 	blk_start_plug(&plug);
1014 	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1015 		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1016 			break;
1017 		xfs_zone_gc_write_chunk(chunk);
1018 	}
1019 	blk_finish_plug(&plug);
1020 
1021 	blk_start_plug(&plug);
1022 	while (xfs_zone_gc_start_chunk(data))
1023 		;
1024 	blk_finish_plug(&plug);
1025 	return true;
1026 }
1027 
1028 /*
1029  * Note that the current GC algorithm would break reflinks and thus duplicate
1030  * data that was shared by multiple owners before.  Because of that reflinks
1031  * are currently not supported on zoned file systems and can't be created or
1032  * mounted.
1033  */
1034 static int
1035 xfs_zoned_gcd(
1036 	void			*private)
1037 {
1038 	struct xfs_zone_gc_data	*data = private;
1039 	struct xfs_mount	*mp = data->mp;
1040 	struct xfs_zone_info	*zi = mp->m_zone_info;
1041 	unsigned int		nofs_flag;
1042 
1043 	nofs_flag = memalloc_nofs_save();
1044 	set_freezable();
1045 
1046 	for (;;) {
1047 		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1048 		xfs_set_zonegc_running(mp);
1049 		if (xfs_zone_gc_handle_work(data))
1050 			continue;
1051 
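		/*
		 * Only check for stop or park requests once all outstanding
		 * GC work has drained, so that no chunk is left in flight
		 * when the thread parks or exits.
		 */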
1052 		if (list_empty(&data->reading) &&
1053 		    list_empty(&data->writing) &&
1054 		    list_empty(&data->resetting) &&
1055 		    !zi->zi_reset_list) {
1056 			xfs_clear_zonegc_running(mp);
1057 			xfs_zoned_resv_wake_all(mp);
1058 
1059 			if (kthread_should_stop()) {
1060 				__set_current_state(TASK_RUNNING);
1061 				break;
1062 			}
1063 
1064 			if (kthread_should_park()) {
1065 				__set_current_state(TASK_RUNNING);
1066 				kthread_parkme();
1067 				continue;
1068 			}
1069 		}
1070 
1071 		schedule();
1072 	}
1073 	xfs_clear_zonegc_running(mp);
1074 
1075 	if (data->iter.victim_rtg)
1076 		xfs_rtgroup_rele(data->iter.victim_rtg);
1077 
1078 	memalloc_nofs_restore(nofs_flag);
1079 	xfs_zone_gc_data_free(data);
1080 	return 0;
1081 }
1082 
1083 void
1084 xfs_zone_gc_start(
1085 	struct xfs_mount	*mp)
1086 {
1087 	if (xfs_has_zoned(mp))
1088 		kthread_unpark(mp->m_zone_info->zi_gc_thread);
1089 }
1090 
1091 void
1092 xfs_zone_gc_stop(
1093 	struct xfs_mount	*mp)
1094 {
1095 	if (xfs_has_zoned(mp))
1096 		kthread_park(mp->m_zone_info->zi_gc_thread);
1097 }
1098 
1099 int
1100 xfs_zone_gc_mount(
1101 	struct xfs_mount	*mp)
1102 {
1103 	struct xfs_zone_info	*zi = mp->m_zone_info;
1104 	struct xfs_zone_gc_data	*data;
1105 	struct xfs_open_zone	*oz;
1106 	int			error;
1107 
1108 	/*
1109 	 * If there are no free zones available for GC, pick the open zone with
1110 	 * the least used space to GC into.  This should only happen after an
1111 	 * unclean shutdown near ENOSPC while GC was ongoing.
1112 	 *
1113 	 * We also need to do this for the first gc zone allocation if we
1114 	 * unmounted while at the open limit.
1115 	 */
1116 	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1117 	    zi->zi_nr_open_zones == mp->m_max_open_zones)
1118 		oz = xfs_zone_gc_steal_open(zi);
1119 	else
1120 		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1121 	if (!oz) {
1122 		xfs_warn(mp, "unable to allocate a zone for gc");
1123 		error = -EIO;
1124 		goto out;
1125 	}
1126 
1127 	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1128 	zi->zi_open_gc_zone = oz;
1129 
1130 	data = xfs_zone_gc_data_alloc(mp);
1131 	if (!data) {
1132 		error = -ENOMEM;
1133 		goto out_put_gc_zone;
1134 	}
1135 
1136 	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1137 			"xfs-zone-gc/%s", mp->m_super->s_id);
1138 	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
1139 		xfs_warn(mp, "unable to create zone gc thread");
1140 		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
1141 		goto out_free_gc_data;
1142 	}
1143 
1144 	/* xfs_zone_gc_start will unpark for rw mounts */
1145 	kthread_park(mp->m_zone_info->zi_gc_thread);
1146 	return 0;
1147 
1148 out_free_gc_data:
1149 	kfree(data);
1150 out_put_gc_zone:
1151 	xfs_open_zone_put(zi->zi_open_gc_zone);
1152 out:
1153 	return error;
1154 }
1155 
1156 void
1157 xfs_zone_gc_unmount(
1158 	struct xfs_mount	*mp)
1159 {
1160 	struct xfs_zone_info	*zi = mp->m_zone_info;
1161 
1162 	kthread_stop(zi->zi_gc_thread);
1163 	if (zi->zi_open_gc_zone)
1164 		xfs_open_zone_put(zi->zi_open_gc_zone);
1165 }
1166