// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_trans.h"
#include "xfs_icache.h"
#include "xfs_rmap.h"
#include "xfs_rtbitmap.h"
#include "xfs_rtrmap_btree.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"
#include "xfs_trace.h"

/*
 * Implement Garbage Collection (GC) of partially used zones.
 *
 * To support the purely sequential writes in each zone, zoned XFS needs to be
 * able to move data remaining in a zone out of it to reset the zone to prepare
 * for writing to it again.
 *
 * This is done by the GC thread implemented in this file. To support that, a
 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
 * write the garbage collected data into.
 *
 * Whenever the available space is below the chosen threshold, the GC thread
 * looks for potential non-empty but not fully used zones that are worth
 * reclaiming. Once found, the rmap for the victim zone is queried, and after
 * a bit of sorting to reduce fragmentation, the still live extents are read
 * into memory and written to the GC target zone, and the bmap btree of the
 * files is updated to point to the new location. To avoid taking the IOLOCK
 * and MMAPLOCK for the entire GC process and thus affecting the latency of
 * user reads and writes to the files, the GC writes are speculative and the
 * I/O completion checks that no other writes happened for the affected regions
 * before remapping.
 *
 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
 * carefully ensures that the RT device cache is flushed and all transactions
 * referencing the rmap have been committed to disk.
 */

/*
 * Size of each GC scratch pad. This is also the upper bound for each
 * GC I/O, which helps to keep latency down.
 */
#define XFS_GC_CHUNK_SIZE	SZ_1M

/*
 * Scratchpad data to read GCed data into.
 *
 * The offset member tracks where the next allocation starts, and freed tracks
 * the amount of space that is not used anymore.
 */
#define XFS_ZONE_GC_NR_SCRATCH	2
struct xfs_zone_scratch {
	struct folio		*folio;
	unsigned int		offset;
	unsigned int		freed;
};

/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list. Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio. Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio_vec		bv;
	struct bio		bio;	/* must be last */
};

#define XFS_ZONE_GC_RECS	1024

/* iterator, needs to be reinitialized for each victim zone */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;
	unsigned int		rec_count;
	unsigned int		rec_idx;
	xfs_agblock_t		next_startblock;
	struct xfs_rmap_irec	*recs;
};

/*
 * Per-mount GC state.
 */
struct xfs_zone_gc_data {
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpads used, and index to indicate which one is in use.
	 */
	struct xfs_zone_scratch	scratch[XFS_ZONE_GC_NR_SCRATCH];
	unsigned int		scratch_idx;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};

/*
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE))
		return false;
	if (xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE) <
	    mp->m_groups[XG_TYPE_RTG].blocks *
	    (mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;
	return false;
}

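/*
 * Allocate the per-mount GC state: the rmap record buffer for the victim
 * iterator, the bio_set backing the gc_bios and the scratch folios.
 * Returns NULL if any of the allocations fail.
 */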
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc(sizeof(*data), GFP_KERNEL);
	if (!data)
		return NULL;
	data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs),
			GFP_KERNEL);
	if (!data->iter.recs)
		goto out_free_data;

	/*
	 * We actually only need a single bio_vec. It would be nice to have
	 * a flag that only allocates the inline bvecs and not the separate
	 * bvec pool.
	 */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) {
		data->scratch[i].folio =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE));
		if (!data->scratch[i].folio)
			goto out_free_scratch;
	}
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	while (--i >= 0)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}

static void
xfs_zone_gc_data_free(
	struct xfs_zone_gc_data	*data)
{
	int			i;

	for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++)
		folio_put(data->scratch[i].folio);
	bioset_exit(&data->bio_set);
	kfree(data->iter.recs);
	kfree(data);
}

static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)
{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
}

/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur	*cur,
	const struct xfs_rmap_irec *irec,
	void			*private)
{
	struct xfs_zone_gc_iter	*iter = private;

	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}

#define cmp_int(l, r)		((l > r) - (l < r))

static int
xfs_zone_gc_rmap_rec_cmp(
	const void		*a,
	const void		*b)
{
	const struct xfs_rmap_irec *reca = a;
	const struct xfs_rmap_irec *recb = b;
	int			diff;

	diff = cmp_int(reca->rm_owner, recb->rm_owner);
	if (diff)
		return diff;
	return cmp_int(reca->rm_offset, recb->rm_offset);
}

static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	error = xfs_trans_alloc_empty(mp, &tp);
	if (error)
		return error;

	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}

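/*
 * Get the next rmap record to garbage collect and grab a reference to the
 * inode that owns it. Records whose inode was already deleted or that don't
 * refer to regular realtime file data are skipped; any other lookup failure
 * shuts the file system down.
 */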
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}

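/*
 * Account for count_fsb blocks of the current rmap record having been
 * processed and move on to the next record once it is fully consumed.
 */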
static void
xfs_zone_gc_iter_advance(
	struct xfs_zone_gc_iter	*iter,
	xfs_extlen_t		count_fsb)
{
	struct xfs_rmap_irec	*irec = &iter->recs[iter->rec_idx];

	irec->rm_offset += count_fsb;
	irec->rm_startblock += count_fsb;
	irec->rm_blockcount -= count_fsb;
	if (!irec->rm_blockcount)
		iter->rec_idx++;
}

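/*
 * Pick the zone with the smallest number of used blocks from the given
 * used-space bucket. Zones without any used blocks are skipped as they are
 * only waiting for a reset and there is nothing to copy out of them.
 */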
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim. All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}

/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	if (xfs_is_shutdown(mp))
		return false;

	if (iter->victim_rtg)
		return true;

	/*
	 * Don't start new work if we are asked to stop or park.
	 */
	if (kthread_should_stop() || kthread_should_park())
		return false;

	if (!xfs_zoned_need_gc(mp))
		return false;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}

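/*
 * Take the user-accessible open zone with the least data written away from
 * regular writers and dedicate it to GC. Only needed when no free zone is
 * available to open as the GC target.
 */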
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		if (!found ||
		    oz->oz_write_pointer < found->oz_write_pointer)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}

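/*
 * Open the next GC target zone. The previous target is only replaced once
 * all writes to it have completed, so return NULL while any are still
 * pending and let the caller retry later.
 */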
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}

/*
 * Ensure we have a valid open zone to write the GC data to.
 *
 * If the current target zone has space keep writing to it, else first wait for
 * all pending writes and then pick a new one.
 */
static struct xfs_open_zone *
xfs_zone_gc_ensure_target(
	struct xfs_mount	*mp)
{
	struct xfs_open_zone	*oz = mp->m_zone_info->zi_open_gc_zone;

	if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg))
		return xfs_zone_gc_select_target(mp);
	return oz;
}

static unsigned int
xfs_zone_gc_scratch_available(
	struct xfs_zone_gc_data	*data)
{
	return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset;
}

static bool
xfs_zone_gc_space_available(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz)
		return false;
	return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) &&
		xfs_zone_gc_scratch_available(data);
}

static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}

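/*
 * Allocate blocks in the GC target zone for the next chunk. *count_fsb is
 * clamped to the available scratch space, the space left in the target zone
 * and the reserved free space pools. Returns the target open zone with an
 * extra reference held, or NULL if nothing can be allocated right now.
 */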
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	*count_fsb = min(*count_fsb,
		XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data)));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer);
	oz->oz_write_pointer += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}

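/*
 * Start garbage collecting the next chunk: look up the next rmap record and
 * its inode, allocate space in the GC target zone and submit a read of the
 * old data into the scratch folio.
 */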
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->scratch = &data->scratch[data->scratch_idx];
	chunk->data = data;
	chunk->oz = oz;

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len,
			chunk->scratch->offset);
	chunk->scratch->offset += chunk->len;
	if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) {
		data->scratch_idx =
			(data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH;
	}
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}

static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}

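/*
 * Submit the write for a chunk. For sequential write required zones the
 * write is issued as a zone append so that the device picks the actual
 * location; the resulting sector is read back in xfs_zone_gc_finish_chunk().
 */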
static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}

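/*
 * Split off the front of a GC write that exceeds the device's zone append
 * limit. The split chunk takes its own inode and open zone references so
 * that its completion is handled like that of any other chunk.
 */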
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}

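/*
 * The read for a chunk has completed. Reuse the bio to write the data from
 * the scratch folio to its new location, splitting the write if it exceeds
 * the zone append limit.
 */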
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	unsigned int		folio_offset = chunk->bio.bi_io_vec->bv_offset;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE);
	bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len,
			folio_offset);

	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}

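/*
 * The write for a chunk has completed: release its scratch space, wait out
 * racing direct I/O and layouts, then remap the file range to the new
 * location. xfs_zoned_end_io() only remaps if the old startblock is still in
 * place, so data overwritten by a racing write is simply dropped.
 */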
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	chunk->scratch->freed += chunk->len;
	if (chunk->scratch->freed == chunk->scratch->offset) {
		chunk->scratch->offset = 0;
		chunk->scratch->freed = 0;
	}

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}

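/*
 * A zone reset (or discard) has completed. Mark the zone free again and wake
 * up anyone waiting for usable space.
 */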
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}

static bool
xfs_zone_gc_prepare_reset(
	struct bio		*bio,
	struct xfs_rtgroup	*rtg)
{
	trace_xfs_zone_reset(rtg);

	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		if (!bdev_max_discard_sectors(bio->bi_bdev))
			return false;
		bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC;
		bio->bi_iter.bi_size =
			XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg));
	}

	return true;
}

int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	int			error = 0;
	struct bio		bio;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET);
	if (xfs_zone_gc_prepare_reset(&bio, rtg))
		error = submit_bio_wait(&bio);
	bio_uninit(&bio);

	return error;
}

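/*
 * Reset all zones queued on the passed-in list. The RT device cache is
 * flushed and the log is forced for each zone's rmap inode first so that no
 * reference to the soon-to-be-reset blocks is left in volatile caches.
 */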
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		xfs_log_force_inode(rtg_rmap(rtg));

		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);

		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (xfs_zone_gc_prepare_reset(bio, rtg))
			submit_bio(bio);
		else
			bio_endio(bio);
	} while (next);
}

/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 */
static bool
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (!xfs_zone_gc_select_victim(data) ||
	    !xfs_zone_gc_space_available(data)) {
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !reset_list)
			return false;
	}

	__set_current_state(TASK_RUNNING);
	try_to_freeze();

	if (reset_list)
		xfs_zone_gc_reset_zones(data, reset_list);

	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_finish_chunk(chunk);
	}

	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	blk_start_plug(&plug);
	while (xfs_zone_gc_start_chunk(data))
		;
	blk_finish_plug(&plug);
	return true;
}

/*
 * Note that the current GC algorithm would break reflinks and thus duplicate
 * data that was shared by multiple owners before. Because of that, reflinks
 * are currently not supported on zoned file systems and such a combination
 * can neither be created nor mounted.
 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);
		if (xfs_zone_gc_handle_work(data))
			continue;

		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}

void
xfs_zone_gc_start(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_unpark(mp->m_zone_info->zi_gc_thread);
}

void
xfs_zone_gc_stop(
	struct xfs_mount	*mp)
{
	if (xfs_has_zoned(mp))
		kthread_park(mp->m_zone_info->zi_gc_thread);
}

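/*
 * Set up GC at mount time: pick or open the initial GC target zone, allocate
 * the per-mount GC state and create the GC thread, which stays parked until
 * xfs_zone_gc_start() is called for a read-write mount.
 */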
int
xfs_zone_gc_mount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_gc_data	*data;
	struct xfs_open_zone	*oz;
	int			error;

	/*
	 * If there are no free zones available for GC, pick the open zone with
	 * the least used space to GC into. This should only happen after an
	 * unclean shutdown near ENOSPC while GC was ongoing.
	 *
	 * We also need to do this for the first gc zone allocation if we
	 * unmounted while at the open limit.
	 */
	if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
	    zi->zi_nr_open_zones == mp->m_max_open_zones)
		oz = xfs_zone_gc_steal_open(zi);
	else
		oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (!oz) {
		xfs_warn(mp, "unable to allocate a zone for gc");
		error = -EIO;
		goto out;
	}

	trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	zi->zi_open_gc_zone = oz;

	data = xfs_zone_gc_data_alloc(mp);
	if (!data) {
		error = -ENOMEM;
		goto out_put_gc_zone;
	}

	mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
			"xfs-zone-gc/%s", mp->m_super->s_id);
	if (IS_ERR(mp->m_zone_info->zi_gc_thread)) {
		xfs_warn(mp, "unable to create zone gc thread");
		error = PTR_ERR(mp->m_zone_info->zi_gc_thread);
		goto out_free_gc_data;
	}

	/* xfs_zone_gc_start will unpark for rw mounts */
	kthread_park(mp->m_zone_info->zi_gc_thread);
	return 0;

out_free_gc_data:
	kfree(data);
out_put_gc_zone:
	xfs_open_zone_put(zi->zi_open_gc_zone);
out:
	return error;
}

void
xfs_zone_gc_unmount(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;

	kthread_stop(zi->zi_gc_thread);
	if (zi->zi_open_gc_zone)
		xfs_open_zone_put(zi->zi_open_gc_zone);
}