1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2023-2025 Christoph Hellwig.
4 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5 */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_errortag.h"
20 #include "xfs_error.h"
21 #include "xfs_zone_alloc.h"
22 #include "xfs_zone_priv.h"
23 #include "xfs_zones.h"
24 #include "xfs_trace.h"
25
26 /*
 * Implement Garbage Collection (GC) of partially used zones.
28 *
29 * To support the purely sequential writes in each zone, zoned XFS needs to be
30 * able to move data remaining in a zone out of it to reset the zone to prepare
31 * for writing to it again.
32 *
33 * This is done by the GC thread implemented in this file. To support that a
34 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
35 * write the garbage collected data into.
36 *
37 * Whenever the available space is below the chosen threshold, the GC thread
38 * looks for potential non-empty but not fully used zones that are worth
39 * reclaiming. Once found the rmap for the victim zone is queried, and after
40 * a bit of sorting to reduce fragmentation, the still live extents are read
41 * into memory and written to the GC target zone, and the bmap btree of the
42 * files is updated to point to the new location. To avoid taking the IOLOCK
43 * and MMAPLOCK for the entire GC process and thus affecting the latency of
44 * user reads and writes to the files, the GC writes are speculative and the
45 * I/O completion checks that no other writes happened for the affected regions
46 * before remapping.
47 *
48 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset. The reset operation
50 * carefully ensures that the RT device cache is flushed and all transactions
51 * referencing the rmap have been committed to disk.
52 */
53
54 /*
55 * Size of each GC scratch allocation, and the number of buffers.
56 */
57 #define XFS_GC_BUF_SIZE SZ_1M
58 #define XFS_GC_NR_BUFS 2
59 static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
60
61 /*
62 * Chunk that is read and written for each GC operation.
63 *
64 * Note that for writes to actual zoned devices, the chunk can be split when
65 * reaching the hardware limit.
66 */
67 struct xfs_gc_bio {
68 struct xfs_zone_gc_data *data;
69
70 /*
71 * Entry into the reading/writing/resetting list. Only accessed from
72 * the GC thread, so no locking needed.
73 */
74 struct list_head entry;
75
76 /*
77 * State of this gc_bio. Done means the current I/O completed.
78 * Set from the bio end I/O handler, read from the GC thread.
79 */
80 enum {
81 XFS_GC_BIO_NEW,
82 XFS_GC_BIO_DONE,
83 } state;
84
85 /*
86 * Pointer to the inode and byte range in the inode that this
87 * GC chunk is operating on.
88 */
89 struct xfs_inode *ip;
90 loff_t offset;
91 unsigned int len;
92
93 /*
94 * Existing startblock (in the zone to be freed) and newly assigned
95 * daddr in the zone GCed into.
96 */
97 xfs_fsblock_t old_startblock;
98 xfs_daddr_t new_daddr;
99
100 /* Are we writing to a sequential write required zone? */
101 bool is_seq;
102
103 /* Open Zone being written to */
104 struct xfs_open_zone *oz;
105
106 struct xfs_rtgroup *victim_rtg;
107
108 /* Bio used for reads and writes, including the bvec used by it */
109 struct bio bio; /* must be last */
110 };
111
112 #define XFS_ZONE_GC_RECS 1024
113
114 /* iterator, needs to be reinitialized for each victim zone */
115 struct xfs_zone_gc_iter {
116 struct xfs_rtgroup *victim_rtg;
117 unsigned int rec_count;
118 unsigned int rec_idx;
119 xfs_agblock_t next_startblock;
120 struct xfs_rmap_irec *recs;
121 };
122
123 /*
124 * Per-mount GC state.
125 */
126 struct xfs_zone_gc_data {
127 struct xfs_mount *mp;
128
129 /* bioset used to allocate the gc_bios */
130 struct bio_set bio_set;
131
132 /*
133 * Scratchpad to buffer GC data, organized as a ring buffer over
134 * discontiguous folios. scratch_head is where the buffer is filled,
135 * scratch_tail tracks the buffer space freed, and scratch_available
136 * counts the space available in the ring buffer between the head and
137 * the tail.
138 */
139 struct folio *scratch_folios[XFS_GC_NR_BUFS];
140 unsigned int scratch_size;
141 unsigned int scratch_available;
142 unsigned int scratch_head;
143 unsigned int scratch_tail;
144
145 /*
146 * List of bios currently being read, written and reset.
147 * These lists are only accessed by the GC thread itself, and must only
148 * be processed in order.
149 */
150 struct list_head reading;
151 struct list_head writing;
152 struct list_head resetting;
153
154 /*
155 * Iterator for the victim zone.
156 */
157 struct xfs_zone_gc_iter iter;
158 };
159
160 /*
161 * We aim to keep enough zones free in stock to fully use the open zone limit
162 * for data placement purposes. Additionally, the m_zonegc_low_space tunable
163 * can be set to make sure a fraction of the unused blocks are available for
164 * writing.
165 */
166 bool
xfs_zoned_need_gc(struct xfs_mount * mp)167 xfs_zoned_need_gc(
168 struct xfs_mount *mp)
169 {
170 s64 available, free, threshold;
171 s32 remainder;
172
173 if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
174 return false;
175
176 available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);
177
178 if (available <
179 xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
180 return true;
181
182 free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);
183
184 threshold = div_s64_rem(free, 100, &remainder);
185 threshold = threshold * mp->m_zonegc_low_space +
186 remainder * div_s64(mp->m_zonegc_low_space, 100);
187
188 if (available < threshold)
189 return true;
190
191 return false;
192 }
193
194 static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(struct xfs_mount * mp)195 xfs_zone_gc_data_alloc(
196 struct xfs_mount *mp)
197 {
198 struct xfs_zone_gc_data *data;
199 int i;
200
201 data = kzalloc_obj(*data);
202 if (!data)
203 return NULL;
204 data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS);
205 if (!data->iter.recs)
206 goto out_free_data;
207
208 if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
209 BIOSET_NEED_BVECS))
210 goto out_free_recs;
211 for (i = 0; i < XFS_GC_NR_BUFS; i++) {
212 data->scratch_folios[i] =
213 folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
214 if (!data->scratch_folios[i])
215 goto out_free_scratch;
216 }
217 data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
218 data->scratch_available = data->scratch_size;
219 INIT_LIST_HEAD(&data->reading);
220 INIT_LIST_HEAD(&data->writing);
221 INIT_LIST_HEAD(&data->resetting);
222 data->mp = mp;
223 return data;
224
225 out_free_scratch:
226 while (--i >= 0)
227 folio_put(data->scratch_folios[i]);
228 bioset_exit(&data->bio_set);
229 out_free_recs:
230 kfree(data->iter.recs);
231 out_free_data:
232 kfree(data);
233 return NULL;
234 }
235
236 static void
xfs_zone_gc_data_free(struct xfs_zone_gc_data * data)237 xfs_zone_gc_data_free(
238 struct xfs_zone_gc_data *data)
239 {
240 int i;
241
242 for (i = 0; i < XFS_GC_NR_BUFS; i++)
243 folio_put(data->scratch_folios[i]);
244 bioset_exit(&data->bio_set);
245 kfree(data->iter.recs);
246 kfree(data);
247 }
248
249 static void
xfs_zone_gc_iter_init(struct xfs_zone_gc_iter * iter,struct xfs_rtgroup * victim_rtg)250 xfs_zone_gc_iter_init(
251 struct xfs_zone_gc_iter *iter,
252 struct xfs_rtgroup *victim_rtg)
253
254 {
255 iter->next_startblock = 0;
256 iter->rec_count = 0;
257 iter->rec_idx = 0;
258 iter->victim_rtg = victim_rtg;
259 atomic_inc(&victim_rtg->rtg_gccount);
260 }
261
262 /*
263 * Query the rmap of the victim zone to gather the records to evacuate.
264 */
265 static int
xfs_zone_gc_query_cb(struct xfs_btree_cur * cur,const struct xfs_rmap_irec * irec,void * private)266 xfs_zone_gc_query_cb(
267 struct xfs_btree_cur *cur,
268 const struct xfs_rmap_irec *irec,
269 void *private)
270 {
271 struct xfs_zone_gc_iter *iter = private;
272
273 ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
274 ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
275 ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));
276
277 iter->recs[iter->rec_count] = *irec;
278 if (++iter->rec_count == XFS_ZONE_GC_RECS) {
279 iter->next_startblock =
280 irec->rm_startblock + irec->rm_blockcount;
281 return 1;
282 }
283 return 0;
284 }
285
286 static int
xfs_zone_gc_rmap_rec_cmp(const void * a,const void * b)287 xfs_zone_gc_rmap_rec_cmp(
288 const void *a,
289 const void *b)
290 {
291 const struct xfs_rmap_irec *reca = a;
292 const struct xfs_rmap_irec *recb = b;
293 int diff;
294
295 diff = cmp_int(reca->rm_owner, recb->rm_owner);
296 if (diff)
297 return diff;
298 return cmp_int(reca->rm_offset, recb->rm_offset);
299 }
300
301 static int
xfs_zone_gc_query(struct xfs_mount * mp,struct xfs_zone_gc_iter * iter)302 xfs_zone_gc_query(
303 struct xfs_mount *mp,
304 struct xfs_zone_gc_iter *iter)
305 {
306 struct xfs_rtgroup *rtg = iter->victim_rtg;
307 struct xfs_rmap_irec ri_low = { };
308 struct xfs_rmap_irec ri_high;
309 struct xfs_btree_cur *cur;
310 struct xfs_trans *tp;
311 int error;
312
313 ASSERT(iter->next_startblock <= rtg_blocks(rtg));
314 if (iter->next_startblock == rtg_blocks(rtg))
315 goto done;
316
317 ASSERT(iter->next_startblock < rtg_blocks(rtg));
318 ri_low.rm_startblock = iter->next_startblock;
319 memset(&ri_high, 0xFF, sizeof(ri_high));
320
321 iter->rec_idx = 0;
322 iter->rec_count = 0;
323
324 tp = xfs_trans_alloc_empty(mp);
325 xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
326 cur = xfs_rtrmapbt_init_cursor(tp, rtg);
327 error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
328 xfs_zone_gc_query_cb, iter);
329 xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
330 xfs_btree_del_cursor(cur, error < 0 ? error : 0);
331 xfs_trans_cancel(tp);
332
333 if (error < 0)
334 return error;
335
336 /*
337 * Sort the rmap records by inode number and increasing offset to
338 * defragment the mappings.
339 *
340 * This could be further enhanced by an even bigger look ahead window,
341 * but that's better left until we have better detection of changes to
342 * inode mapping to avoid the potential of GCing already dead data.
343 */
344 sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
345 xfs_zone_gc_rmap_rec_cmp, NULL);
346
347 if (error == 0) {
348 /*
349 * We finished iterating through the zone.
350 */
351 iter->next_startblock = rtg_blocks(rtg);
352 if (iter->rec_count == 0)
353 goto done;
354 }
355
356 return 0;
357 done:
358 atomic_dec(&iter->victim_rtg->rtg_gccount);
359 xfs_rtgroup_rele(iter->victim_rtg);
360 iter->victim_rtg = NULL;
361 return 0;
362 }
363
364 static bool
xfs_zone_gc_iter_next(struct xfs_mount * mp,struct xfs_zone_gc_iter * iter,struct xfs_rmap_irec * chunk_rec,struct xfs_inode ** ipp)365 xfs_zone_gc_iter_next(
366 struct xfs_mount *mp,
367 struct xfs_zone_gc_iter *iter,
368 struct xfs_rmap_irec *chunk_rec,
369 struct xfs_inode **ipp)
370 {
371 struct xfs_rmap_irec *irec;
372 int error;
373
374 if (!iter->victim_rtg)
375 return false;
376
377 retry:
378 if (iter->rec_idx == iter->rec_count) {
379 error = xfs_zone_gc_query(mp, iter);
380 if (error)
381 goto fail;
382 if (!iter->victim_rtg)
383 return false;
384 }
385
386 irec = &iter->recs[iter->rec_idx];
387 error = xfs_iget(mp, NULL, irec->rm_owner,
388 XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
389 if (error) {
390 /*
391 * If the inode was already deleted, skip over it.
392 */
393 if (error == -ENOENT) {
394 iter->rec_idx++;
395 goto retry;
396 }
397 goto fail;
398 }
399
400 if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
401 iter->rec_idx++;
402 xfs_irele(*ipp);
403 goto retry;
404 }
405
406 *chunk_rec = *irec;
407 return true;
408
409 fail:
410 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
411 return false;
412 }
413
414 static void
xfs_zone_gc_iter_advance(struct xfs_zone_gc_iter * iter,xfs_extlen_t count_fsb)415 xfs_zone_gc_iter_advance(
416 struct xfs_zone_gc_iter *iter,
417 xfs_extlen_t count_fsb)
418 {
419 struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
420
421 irec->rm_offset += count_fsb;
422 irec->rm_startblock += count_fsb;
423 irec->rm_blockcount -= count_fsb;
424 if (!irec->rm_blockcount)
425 iter->rec_idx++;
426 }
427
428 static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(struct xfs_mount * mp,uint32_t bucket)429 xfs_zone_gc_pick_victim_from(
430 struct xfs_mount *mp,
431 uint32_t bucket)
432 {
433 struct xfs_zone_info *zi = mp->m_zone_info;
434 uint32_t victim_used = U32_MAX;
435 struct xfs_rtgroup *victim_rtg = NULL;
436 uint32_t bit;
437
438 if (!zi->zi_used_bucket_entries[bucket])
439 return NULL;
440
441 for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
442 mp->m_sb.sb_rgcount) {
443 struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);
444
445 if (!rtg)
446 continue;
447
448 /*
449 * If the zone is already undergoing GC, don't pick it again.
450 *
451 * This prevents us from picking one of the zones for which we
452 * already submitted GC I/O, but for which the remapping hasn't
453 * concluded yet. This won't cause data corruption, but
454 * increases write amplification and slows down GC, so this is
455 * a bad thing.
456 */
457 if (atomic_read(&rtg->rtg_gccount)) {
458 xfs_rtgroup_rele(rtg);
459 continue;
460 }
461
462 /* skip zones that are just waiting for a reset */
463 if (rtg_rmap(rtg)->i_used_blocks == 0 ||
464 rtg_rmap(rtg)->i_used_blocks >= victim_used) {
465 xfs_rtgroup_rele(rtg);
466 continue;
467 }
468
469 if (victim_rtg)
470 xfs_rtgroup_rele(victim_rtg);
471 victim_rtg = rtg;
472 victim_used = rtg_rmap(rtg)->i_used_blocks;
473
474 /*
475 * Any zone that is less than 1 percent used is fair game for
476 * instant reclaim. All of these zones are in the last
477 * bucket, so avoid the expensive division for the zones
478 * in the other buckets.
479 */
480 if (bucket == 0 &&
481 rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
482 break;
483 }
484
485 return victim_rtg;
486 }
487
488 /*
489 * Iterate through all zones marked as reclaimable and find a candidate to
490 * reclaim.
491 */
492 static bool
xfs_zone_gc_select_victim(struct xfs_zone_gc_data * data)493 xfs_zone_gc_select_victim(
494 struct xfs_zone_gc_data *data)
495 {
496 struct xfs_zone_gc_iter *iter = &data->iter;
497 struct xfs_mount *mp = data->mp;
498 struct xfs_zone_info *zi = mp->m_zone_info;
499 struct xfs_rtgroup *victim_rtg = NULL;
500 unsigned int bucket;
501
502 spin_lock(&zi->zi_used_buckets_lock);
503 for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
504 victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
505 if (victim_rtg)
506 break;
507 }
508 spin_unlock(&zi->zi_used_buckets_lock);
509
510 if (!victim_rtg)
511 return false;
512
513 trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
514 xfs_zone_gc_iter_init(iter, victim_rtg);
515 return true;
516 }
517
518 static struct xfs_open_zone *
xfs_zone_gc_steal_open(struct xfs_zone_info * zi)519 xfs_zone_gc_steal_open(
520 struct xfs_zone_info *zi)
521 {
522 struct xfs_open_zone *oz, *found = NULL;
523
524 spin_lock(&zi->zi_open_zones_lock);
525 list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
526 if (!found || oz->oz_allocated < found->oz_allocated)
527 found = oz;
528 }
529
530 if (found) {
531 found->oz_is_gc = true;
532 list_del_init(&found->oz_entry);
533 zi->zi_nr_open_zones--;
534 }
535
536 spin_unlock(&zi->zi_open_zones_lock);
537 return found;
538 }
539
540 static struct xfs_open_zone *
xfs_zone_gc_select_target(struct xfs_mount * mp)541 xfs_zone_gc_select_target(
542 struct xfs_mount *mp)
543 {
544 struct xfs_zone_info *zi = mp->m_zone_info;
545 struct xfs_open_zone *oz = zi->zi_open_gc_zone;
546
547 /*
548 * We need to wait for pending writes to finish.
549 */
550 if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
551 return NULL;
552
553 ASSERT(zi->zi_nr_open_zones <=
554 mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
555 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
556 if (oz)
557 trace_xfs_zone_gc_target_opened(oz->oz_rtg);
558 spin_lock(&zi->zi_open_zones_lock);
559 zi->zi_open_gc_zone = oz;
560 spin_unlock(&zi->zi_open_zones_lock);
561 return oz;
562 }
563
564 /*
565 * Ensure we have a valid open zone to write the GC data to.
566 *
567 * If the current target zone has space keep writing to it, else first wait for
568 * all pending writes and then pick a new one.
569 */
570 static struct xfs_open_zone *
xfs_zone_gc_ensure_target(struct xfs_mount * mp)571 xfs_zone_gc_ensure_target(
572 struct xfs_mount *mp)
573 {
574 struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
575
576 if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
577 return xfs_zone_gc_select_target(mp);
578 return oz;
579 }
580
581 static void
xfs_zone_gc_end_io(struct bio * bio)582 xfs_zone_gc_end_io(
583 struct bio *bio)
584 {
585 struct xfs_gc_bio *chunk =
586 container_of(bio, struct xfs_gc_bio, bio);
587 struct xfs_zone_gc_data *data = chunk->data;
588
589 WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
590 wake_up_process(data->mp->m_zone_info->zi_gc_thread);
591 }
592
593 static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(struct xfs_zone_gc_data * data,xfs_extlen_t * count_fsb,xfs_daddr_t * daddr,bool * is_seq)594 xfs_zone_gc_alloc_blocks(
595 struct xfs_zone_gc_data *data,
596 xfs_extlen_t *count_fsb,
597 xfs_daddr_t *daddr,
598 bool *is_seq)
599 {
600 struct xfs_mount *mp = data->mp;
601 struct xfs_open_zone *oz;
602
603 oz = xfs_zone_gc_ensure_target(mp);
604 if (!oz)
605 return NULL;
606
607 *count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));
608
609 /*
610 * Directly allocate GC blocks from the reserved pool.
611 *
612 * If we'd take them from the normal pool we could be stealing blocks
613 * from a regular writer, which would then have to wait for GC and
614 * deadlock.
615 */
616 spin_lock(&mp->m_sb_lock);
617 *count_fsb = min(*count_fsb,
618 rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
619 *count_fsb = min3(*count_fsb,
620 mp->m_free[XC_FREE_RTEXTENTS].res_avail,
621 mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
622 mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
623 mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
624 spin_unlock(&mp->m_sb_lock);
625
626 if (!*count_fsb)
627 return NULL;
628
629 *daddr = xfs_gbno_to_daddr(rtg_group(oz->oz_rtg), 0);
630 *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
631 if (!*is_seq)
632 *daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
633 oz->oz_allocated += *count_fsb;
634 atomic_inc(&oz->oz_ref);
635 return oz;
636 }
637
638 static void
xfs_zone_gc_add_data(struct xfs_gc_bio * chunk)639 xfs_zone_gc_add_data(
640 struct xfs_gc_bio *chunk)
641 {
642 struct xfs_zone_gc_data *data = chunk->data;
643 unsigned int len = chunk->len;
644 unsigned int off = data->scratch_head;
645
646 do {
647 unsigned int this_off = off % XFS_GC_BUF_SIZE;
648 unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);
649
650 bio_add_folio_nofail(&chunk->bio,
651 data->scratch_folios[off / XFS_GC_BUF_SIZE],
652 this_len, this_off);
653 len -= this_len;
654 off += this_len;
655 if (off == data->scratch_size)
656 off = 0;
657 } while (len);
658 }
659
660 static bool
xfs_zone_gc_start_chunk(struct xfs_zone_gc_data * data)661 xfs_zone_gc_start_chunk(
662 struct xfs_zone_gc_data *data)
663 {
664 struct xfs_zone_gc_iter *iter = &data->iter;
665 struct xfs_mount *mp = data->mp;
666 struct block_device *bdev = mp->m_rtdev_targp->bt_bdev;
667 struct xfs_open_zone *oz;
668 struct xfs_rmap_irec irec;
669 struct xfs_gc_bio *chunk;
670 struct xfs_inode *ip;
671 struct bio *bio;
672 xfs_daddr_t daddr;
673 unsigned int len;
674 bool is_seq;
675
676 if (xfs_is_shutdown(mp))
677 return false;
678
679 if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
680 return false;
681 oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
682 &is_seq);
683 if (!oz) {
684 xfs_irele(ip);
685 return false;
686 }
687
688 len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
689 bio = bio_alloc_bioset(bdev,
690 min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
691 REQ_OP_READ, GFP_NOFS, &data->bio_set);
692
693 chunk = container_of(bio, struct xfs_gc_bio, bio);
694 chunk->ip = ip;
695 chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
696 chunk->len = len;
697 chunk->old_startblock =
698 xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
699 chunk->new_daddr = daddr;
700 chunk->is_seq = is_seq;
701 chunk->data = data;
702 chunk->oz = oz;
703 chunk->victim_rtg = iter->victim_rtg;
704 atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
705 atomic_inc(&chunk->victim_rtg->rtg_gccount);
706
707 bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
708 bio->bi_end_io = xfs_zone_gc_end_io;
709 xfs_zone_gc_add_data(chunk);
710 data->scratch_head = (data->scratch_head + len) % data->scratch_size;
711 data->scratch_available -= len;
712
713 XFS_STATS_INC(mp, xs_gc_read_calls);
714
715 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
716 list_add_tail(&chunk->entry, &data->reading);
717 xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);
718
719 submit_bio(bio);
720 return true;
721 }
722
723 static void
xfs_zone_gc_free_chunk(struct xfs_gc_bio * chunk)724 xfs_zone_gc_free_chunk(
725 struct xfs_gc_bio *chunk)
726 {
727 atomic_dec(&chunk->victim_rtg->rtg_gccount);
728 xfs_rtgroup_rele(chunk->victim_rtg);
729 list_del(&chunk->entry);
730 xfs_open_zone_put(chunk->oz);
731 xfs_irele(chunk->ip);
732 bio_put(&chunk->bio);
733 }
734
735 static void
xfs_zone_gc_submit_write(struct xfs_zone_gc_data * data,struct xfs_gc_bio * chunk)736 xfs_zone_gc_submit_write(
737 struct xfs_zone_gc_data *data,
738 struct xfs_gc_bio *chunk)
739 {
740 if (chunk->is_seq) {
741 chunk->bio.bi_opf &= ~REQ_OP_WRITE;
742 chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
743 }
744 chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
745 chunk->bio.bi_end_io = xfs_zone_gc_end_io;
746 submit_bio(&chunk->bio);
747 }
748
749 static struct xfs_gc_bio *
xfs_zone_gc_split_write(struct xfs_zone_gc_data * data,struct xfs_gc_bio * chunk)750 xfs_zone_gc_split_write(
751 struct xfs_zone_gc_data *data,
752 struct xfs_gc_bio *chunk)
753 {
754 struct queue_limits *lim =
755 &bdev_get_queue(chunk->bio.bi_bdev)->limits;
756 struct xfs_gc_bio *split_chunk;
757 int split_sectors;
758 unsigned int split_len;
759 struct bio *split;
760 unsigned int nsegs;
761
762 if (!chunk->is_seq)
763 return NULL;
764
765 split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
766 lim->max_zone_append_sectors << SECTOR_SHIFT);
767 if (!split_sectors)
768 return NULL;
769
770 /* ensure the split chunk is still block size aligned */
771 split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
772 data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
773 split_len = split_sectors << SECTOR_SHIFT;
774
775 split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
776 split_chunk = container_of(split, struct xfs_gc_bio, bio);
777 split_chunk->data = data;
778 ihold(VFS_I(chunk->ip));
779 split_chunk->ip = chunk->ip;
780 split_chunk->is_seq = chunk->is_seq;
781 split_chunk->offset = chunk->offset;
782 split_chunk->len = split_len;
783 split_chunk->old_startblock = chunk->old_startblock;
784 split_chunk->new_daddr = chunk->new_daddr;
785 split_chunk->oz = chunk->oz;
786 atomic_inc(&chunk->oz->oz_ref);
787
788 split_chunk->victim_rtg = chunk->victim_rtg;
789 atomic_inc(&rtg_group(chunk->victim_rtg)->xg_active_ref);
790 atomic_inc(&chunk->victim_rtg->rtg_gccount);
791
792 chunk->offset += split_len;
793 chunk->len -= split_len;
794 chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);
795
796 /* add right before the original chunk */
797 WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
798 list_add_tail(&split_chunk->entry, &chunk->entry);
799 return split_chunk;
800 }
801
802 static void
xfs_zone_gc_write_chunk(struct xfs_gc_bio * chunk)803 xfs_zone_gc_write_chunk(
804 struct xfs_gc_bio *chunk)
805 {
806 struct xfs_zone_gc_data *data = chunk->data;
807 struct xfs_mount *mp = chunk->ip->i_mount;
808 struct xfs_gc_bio *split_chunk;
809
810 if (chunk->bio.bi_status)
811 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
812 if (xfs_is_shutdown(mp)) {
813 xfs_zone_gc_free_chunk(chunk);
814 return;
815 }
816
817 XFS_STATS_INC(mp, xs_gc_write_calls);
818 XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);
819
820 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
821 list_move_tail(&chunk->entry, &data->writing);
822
823 bio_reuse(&chunk->bio, REQ_OP_WRITE);
824 while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
825 xfs_zone_gc_submit_write(data, split_chunk);
826 xfs_zone_gc_submit_write(data, chunk);
827 }
828
829 static void
xfs_zone_gc_finish_chunk(struct xfs_gc_bio * chunk)830 xfs_zone_gc_finish_chunk(
831 struct xfs_gc_bio *chunk)
832 {
833 uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
834 struct xfs_zone_gc_data *data = chunk->data;
835 struct xfs_inode *ip = chunk->ip;
836 struct xfs_mount *mp = ip->i_mount;
837 int error;
838
839 if (chunk->bio.bi_status)
840 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
841 if (xfs_is_shutdown(mp)) {
842 xfs_zone_gc_free_chunk(chunk);
843 return;
844 }
845
846 data->scratch_tail =
847 (data->scratch_tail + chunk->len) % data->scratch_size;
848 data->scratch_available += chunk->len;
849
850 /*
851 * Cycle through the iolock and wait for direct I/O and layouts to
852 * ensure no one is reading from the old mapping before it goes away.
853 *
854 * Note that xfs_zoned_end_io() below checks that no other writer raced
855 * with us to update the mapping by checking that the old startblock
856 * didn't change.
857 */
858 xfs_ilock(ip, iolock);
859 error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
860 if (!error)
861 inode_dio_wait(VFS_I(ip));
862 xfs_iunlock(ip, iolock);
863 if (error)
864 goto free;
865
866 if (chunk->is_seq)
867 chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
868 error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
869 chunk->new_daddr, chunk->oz, chunk->old_startblock);
870 free:
871 if (error)
872 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
873 xfs_zone_gc_free_chunk(chunk);
874 }
875
876 static void
xfs_zone_gc_finish_reset(struct xfs_gc_bio * chunk)877 xfs_zone_gc_finish_reset(
878 struct xfs_gc_bio *chunk)
879 {
880 struct xfs_rtgroup *rtg = chunk->bio.bi_private;
881 struct xfs_mount *mp = rtg_mount(rtg);
882 struct xfs_zone_info *zi = mp->m_zone_info;
883
884 if (chunk->bio.bi_status) {
885 xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
886 goto out;
887 }
888
889 xfs_group_set_mark(rtg_group(rtg), XFS_RTG_FREE);
890 atomic_inc(&zi->zi_nr_free_zones);
891
892 xfs_zoned_add_available(mp, rtg_blocks(rtg));
893
894 wake_up_all(&zi->zi_zone_wait);
895 out:
896 list_del(&chunk->entry);
897 bio_put(&chunk->bio);
898 }
899
900 static void
xfs_submit_zone_reset_bio(struct xfs_rtgroup * rtg,struct bio * bio)901 xfs_submit_zone_reset_bio(
902 struct xfs_rtgroup *rtg,
903 struct bio *bio)
904 {
905 struct xfs_mount *mp = rtg_mount(rtg);
906
907 trace_xfs_zone_reset(rtg);
908
909 ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);
910
911 if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
912 bio_io_error(bio);
913 return;
914 }
915
916 XFS_STATS_INC(mp, xs_gc_zone_reset_calls);
917
918 bio->bi_iter.bi_sector = xfs_gbno_to_daddr(rtg_group(rtg), 0);
919 if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
920 /*
921 * Also use the bio to drive the state machine when neither
922 * zone reset nor discard is supported to keep things simple.
923 */
924 if (!bdev_max_discard_sectors(bio->bi_bdev)) {
925 bio_endio(bio);
926 return;
927 }
928 bio->bi_opf &= ~REQ_OP_ZONE_RESET;
929 bio->bi_opf |= REQ_OP_DISCARD;
930 bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
931 }
932
933 submit_bio(bio);
934 }
935
xfs_bio_wait_endio(struct bio * bio)936 static void xfs_bio_wait_endio(struct bio *bio)
937 {
938 complete(bio->bi_private);
939 }
940
941 int
xfs_zone_gc_reset_sync(struct xfs_rtgroup * rtg)942 xfs_zone_gc_reset_sync(
943 struct xfs_rtgroup *rtg)
944 {
945 DECLARE_COMPLETION_ONSTACK(done);
946 struct bio bio;
947 int error;
948
949 bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
950 REQ_OP_ZONE_RESET | REQ_SYNC);
951 bio.bi_private = &done;
952 bio.bi_end_io = xfs_bio_wait_endio;
953 xfs_submit_zone_reset_bio(rtg, &bio);
954 wait_for_completion_io(&done);
955
956 error = blk_status_to_errno(bio.bi_status);
957 bio_uninit(&bio);
958 return error;
959 }
960
961 static void
xfs_zone_gc_reset_zones(struct xfs_zone_gc_data * data,struct xfs_group * reset_list)962 xfs_zone_gc_reset_zones(
963 struct xfs_zone_gc_data *data,
964 struct xfs_group *reset_list)
965 {
966 struct xfs_group *next = reset_list;
967
968 if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
969 xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
970 return;
971 }
972
973 do {
974 struct xfs_rtgroup *rtg = to_rtg(next);
975 struct xfs_gc_bio *chunk;
976 struct bio *bio;
977
978 xfs_log_force_inode(rtg_rmap(rtg));
979
980 next = rtg_group(rtg)->xg_next_reset;
981 rtg_group(rtg)->xg_next_reset = NULL;
982
983 bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
984 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
985 bio->bi_private = rtg;
986 bio->bi_end_io = xfs_zone_gc_end_io;
987
988 chunk = container_of(bio, struct xfs_gc_bio, bio);
989 chunk->data = data;
990 WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
991 list_add_tail(&chunk->entry, &data->resetting);
992 xfs_submit_zone_reset_bio(rtg, bio);
993 } while (next);
994 }
995
996 static bool
xfs_zone_gc_should_start_new_work(struct xfs_zone_gc_data * data)997 xfs_zone_gc_should_start_new_work(
998 struct xfs_zone_gc_data *data)
999 {
1000 struct xfs_open_zone *oz;
1001
1002 if (xfs_is_shutdown(data->mp))
1003 return false;
1004 if (!data->scratch_available)
1005 return false;
1006
1007 oz = xfs_zone_gc_ensure_target(data->mp);
1008 if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
1009 return false;
1010
1011 if (!data->iter.victim_rtg) {
1012 if (kthread_should_stop() || kthread_should_park())
1013 return false;
1014 if (!xfs_zoned_need_gc(data->mp))
1015 return false;
1016 if (!xfs_zone_gc_select_victim(data))
1017 return false;
1018 }
1019
1020 return true;
1021 }
1022
1023 /*
1024 * Handle the work to read and write data for GC and to reset the zones,
1025 * including handling all completions.
1026 *
1027 * Note that the order of the chunks is preserved so that we don't undo the
1028 * optimal order established by xfs_zone_gc_query().
1029 */
1030 static void
xfs_zone_gc_handle_work(struct xfs_zone_gc_data * data)1031 xfs_zone_gc_handle_work(
1032 struct xfs_zone_gc_data *data)
1033 {
1034 struct xfs_zone_info *zi = data->mp->m_zone_info;
1035 struct xfs_gc_bio *chunk, *next;
1036 struct xfs_group *reset_list;
1037 struct blk_plug plug;
1038
1039 spin_lock(&zi->zi_reset_list_lock);
1040 reset_list = zi->zi_reset_list;
1041 zi->zi_reset_list = NULL;
1042 spin_unlock(&zi->zi_reset_list_lock);
1043
1044 if (reset_list) {
1045 set_current_state(TASK_RUNNING);
1046 xfs_zone_gc_reset_zones(data, reset_list);
1047 }
1048
1049 list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
1050 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1051 break;
1052 set_current_state(TASK_RUNNING);
1053 xfs_zone_gc_finish_reset(chunk);
1054 }
1055
1056 list_for_each_entry_safe(chunk, next, &data->writing, entry) {
1057 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1058 break;
1059 set_current_state(TASK_RUNNING);
1060 xfs_zone_gc_finish_chunk(chunk);
1061 }
1062
1063 blk_start_plug(&plug);
1064 list_for_each_entry_safe(chunk, next, &data->reading, entry) {
1065 if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
1066 break;
1067 set_current_state(TASK_RUNNING);
1068 xfs_zone_gc_write_chunk(chunk);
1069 }
1070 blk_finish_plug(&plug);
1071
1072 if (xfs_zone_gc_should_start_new_work(data)) {
1073 set_current_state(TASK_RUNNING);
1074 blk_start_plug(&plug);
1075 while (xfs_zone_gc_start_chunk(data))
1076 ;
1077 blk_finish_plug(&plug);
1078 }
1079 }
1080
1081 /*
1082 * Note that the current GC algorithm would break reflinks and thus duplicate
1083 * data that was shared by multiple owners before. Because of that reflinks
1084 * are currently not supported on zoned file systems and can't be created or
1085 * mounted.
1086 */
1087 static int
xfs_zoned_gcd(void * private)1088 xfs_zoned_gcd(
1089 void *private)
1090 {
1091 struct xfs_zone_gc_data *data = private;
1092 struct xfs_mount *mp = data->mp;
1093 struct xfs_zone_info *zi = mp->m_zone_info;
1094 unsigned int nofs_flag;
1095
1096 nofs_flag = memalloc_nofs_save();
1097 set_freezable();
1098
1099 for (;;) {
1100 set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
1101 xfs_set_zonegc_running(mp);
1102
1103 xfs_zone_gc_handle_work(data);
1104
1105 /*
1106 * Only sleep if nothing set the state to running. Else check for
1107 * work again as someone might have queued up more work and woken
1108 * us in the meantime.
1109 */
1110 if (get_current_state() == TASK_RUNNING) {
1111 try_to_freeze();
1112 continue;
1113 }
1114
1115 if (list_empty(&data->reading) &&
1116 list_empty(&data->writing) &&
1117 list_empty(&data->resetting) &&
1118 !zi->zi_reset_list) {
1119 xfs_clear_zonegc_running(mp);
1120 xfs_zoned_resv_wake_all(mp);
1121
1122 if (kthread_should_stop()) {
1123 __set_current_state(TASK_RUNNING);
1124 break;
1125 }
1126
1127 if (kthread_should_park()) {
1128 __set_current_state(TASK_RUNNING);
1129 kthread_parkme();
1130 continue;
1131 }
1132 }
1133
1134 schedule();
1135 }
1136 xfs_clear_zonegc_running(mp);
1137
1138 if (data->iter.victim_rtg)
1139 xfs_rtgroup_rele(data->iter.victim_rtg);
1140
1141 memalloc_nofs_restore(nofs_flag);
1142 xfs_zone_gc_data_free(data);
1143 return 0;
1144 }
1145
1146 void
xfs_zone_gc_start(struct xfs_mount * mp)1147 xfs_zone_gc_start(
1148 struct xfs_mount *mp)
1149 {
1150 if (xfs_has_zoned(mp))
1151 kthread_unpark(mp->m_zone_info->zi_gc_thread);
1152 }
1153
1154 void
xfs_zone_gc_stop(struct xfs_mount * mp)1155 xfs_zone_gc_stop(
1156 struct xfs_mount *mp)
1157 {
1158 if (xfs_has_zoned(mp))
1159 kthread_park(mp->m_zone_info->zi_gc_thread);
1160 }
1161
1162 int
xfs_zone_gc_mount(struct xfs_mount * mp)1163 xfs_zone_gc_mount(
1164 struct xfs_mount *mp)
1165 {
1166 struct xfs_zone_info *zi = mp->m_zone_info;
1167 struct xfs_zone_gc_data *data;
1168 struct xfs_open_zone *oz;
1169 int error;
1170
1171 /*
1172 * If there are no free zones available for GC, pick the open zone with
1173 * the least used space to GC into. This should only happen after an
1174 * unclean shutdown near ENOSPC while GC was ongoing.
1175 *
1176 * We also need to do this for the first gc zone allocation if we
1177 * unmounted while at the open limit.
1178 */
1179 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1180 zi->zi_nr_open_zones == mp->m_max_open_zones)
1181 oz = xfs_zone_gc_steal_open(zi);
1182 else
1183 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1184 if (!oz) {
1185 xfs_warn(mp, "unable to allocate a zone for gc");
1186 error = -EIO;
1187 goto out;
1188 }
1189
1190 trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1191 zi->zi_open_gc_zone = oz;
1192
1193 data = xfs_zone_gc_data_alloc(mp);
1194 if (!data) {
1195 error = -ENOMEM;
1196 goto out_put_gc_zone;
1197 }
1198
1199 zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1200 "xfs-zone-gc/%s", mp->m_super->s_id);
1201 if (IS_ERR(zi->zi_gc_thread)) {
1202 xfs_warn(mp, "unable to create zone gc thread");
1203 error = PTR_ERR(zi->zi_gc_thread);
1204 goto out_free_gc_data;
1205 }
1206
1207 /* xfs_zone_gc_start will unpark for rw mounts */
1208 kthread_park(zi->zi_gc_thread);
1209 return 0;
1210
1211 out_free_gc_data:
1212 kfree(data);
1213 out_put_gc_zone:
1214 xfs_open_zone_put(zi->zi_open_gc_zone);
1215 out:
1216 return error;
1217 }
1218
1219 void
xfs_zone_gc_unmount(struct xfs_mount * mp)1220 xfs_zone_gc_unmount(
1221 struct xfs_mount *mp)
1222 {
1223 struct xfs_zone_info *zi = mp->m_zone_info;
1224
1225 kthread_stop(zi->zi_gc_thread);
1226 if (zi->zi_open_gc_zone)
1227 xfs_open_zone_put(zi->zi_open_gc_zone);
1228 }
1229