1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (c) 2023-2025 Christoph Hellwig.
4 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5 */
6 #include "xfs_platform.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "xfs_trans.h"
15 #include "xfs_icache.h"
16 #include "xfs_rmap.h"
17 #include "xfs_rtbitmap.h"
18 #include "xfs_rtrmap_btree.h"
19 #include "xfs_errortag.h"
20 #include "xfs_error.h"
21 #include "xfs_zone_alloc.h"
22 #include "xfs_zone_priv.h"
23 #include "xfs_zones.h"
24 #include "xfs_trace.h"
25
26 /*
 * Implement Garbage Collection (GC) of partially used zones.
28 *
29 * To support the purely sequential writes in each zone, zoned XFS needs to be
30 * able to move data remaining in a zone out of it to reset the zone to prepare
31 * for writing to it again.
32 *
33 * This is done by the GC thread implemented in this file. To support that a
34 * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to
35 * write the garbage collected data into.
36 *
37 * Whenever the available space is below the chosen threshold, the GC thread
38 * looks for potential non-empty but not fully used zones that are worth
39 * reclaiming. Once found the rmap for the victim zone is queried, and after
40 * a bit of sorting to reduce fragmentation, the still live extents are read
41 * into memory and written to the GC target zone, and the bmap btree of the
42 * files is updated to point to the new location. To avoid taking the IOLOCK
43 * and MMAPLOCK for the entire GC process and thus affecting the latency of
44 * user reads and writes to the files, the GC writes are speculative and the
45 * I/O completion checks that no other writes happened for the affected regions
46 * before remapping.
47 *
48 * Once a zone does not contain any valid data, be that through GC or user
 * block removal, it is queued for a zone reset.  The reset operation
50 * carefully ensures that the RT device cache is flushed and all transactions
51 * referencing the rmap have been committed to disk.
52 */
53
54 /*
55 * Size of each GC scratch allocation, and the number of buffers.
56 */
57 #define XFS_GC_BUF_SIZE SZ_1M
58 #define XFS_GC_NR_BUFS 2
59 static_assert(XFS_GC_NR_BUFS < BIO_MAX_VECS);
60
/*
 * Chunk that is read and written for each GC operation.
 *
 * Note that for writes to actual zoned devices, the chunk can be split when
 * reaching the hardware limit.
 */
struct xfs_gc_bio {
	/* back pointer to the per-mount GC state */
	struct xfs_zone_gc_data	*data;

	/*
	 * Entry into the reading/writing/resetting list.  Only accessed from
	 * the GC thread, so no locking needed.
	 */
	struct list_head	entry;

	/*
	 * State of this gc_bio.  Done means the current I/O completed.
	 * Set from the bio end I/O handler, read from the GC thread.
	 */
	enum {
		XFS_GC_BIO_NEW,
		XFS_GC_BIO_DONE,
	} state;

	/*
	 * Pointer to the inode and byte range in the inode that this
	 * GC chunk is operating on.
	 */
	struct xfs_inode	*ip;
	loff_t			offset;
	unsigned int		len;

	/*
	 * Existing startblock (in the zone to be freed) and newly assigned
	 * daddr in the zone GCed into.
	 */
	xfs_fsblock_t		old_startblock;
	xfs_daddr_t		new_daddr;
	struct xfs_zone_scratch	*scratch;

	/* Are we writing to a sequential write required zone? */
	bool			is_seq;

	/* Open Zone being written to */
	struct xfs_open_zone	*oz;

	/* Victim zone this chunk's data is being evacuated from */
	struct xfs_rtgroup	*victim_rtg;

	/* Bio used for reads and writes, including the bvec used by it */
	struct bio		bio;	/* must be last */
};
112
113 #define XFS_ZONE_GC_RECS 1024
114
/*
 * Iterator over the rmap records of a victim zone.  Records are gathered in
 * batches of up to XFS_ZONE_GC_RECS; the iterator needs to be reinitialized
 * for each victim zone.
 */
struct xfs_zone_gc_iter {
	struct xfs_rtgroup	*victim_rtg;	/* zone currently being GCed */
	unsigned int		rec_count;	/* valid records in recs[] */
	unsigned int		rec_idx;	/* next record to hand out */
	xfs_agblock_t		next_startblock; /* resume point for next query */
	struct xfs_rmap_irec	*recs;		/* batch buffer for rmap records */
};
123
/*
 * Per-mount GC state.  Allocated when the GC thread is started and only
 * accessed from that thread unless noted otherwise.
 */
struct xfs_zone_gc_data {
	/* mount this GC state belongs to */
	struct xfs_mount	*mp;

	/* bioset used to allocate the gc_bios */
	struct bio_set		bio_set;

	/*
	 * Scratchpad to buffer GC data, organized as a ring buffer over
	 * discontiguous folios.  scratch_head is where the buffer is filled,
	 * scratch_tail tracks the buffer space freed, and scratch_available
	 * counts the space available in the ring buffer between the head and
	 * the tail.  scratch_size is the fixed total capacity.
	 */
	struct folio		*scratch_folios[XFS_GC_NR_BUFS];
	unsigned int		scratch_size;
	unsigned int		scratch_available;
	unsigned int		scratch_head;
	unsigned int		scratch_tail;

	/*
	 * List of bios currently being read, written and reset.
	 * These lists are only accessed by the GC thread itself, and must only
	 * be processed in order.
	 */
	struct list_head	reading;
	struct list_head	writing;
	struct list_head	resetting;

	/*
	 * Iterator for the victim zone.
	 */
	struct xfs_zone_gc_iter	iter;
};
160
/*
 * Decide if garbage collection should run.
 *
 * We aim to keep enough zones free in stock to fully use the open zone limit
 * for data placement purposes.  Additionally, the m_zonegc_low_space tunable
 * can be set to make sure a fraction of the unused blocks are available for
 * writing.
 */
bool
xfs_zoned_need_gc(
	struct xfs_mount	*mp)
{
	s64			available, free, threshold;
	s32			remainder;

	/* Nothing to do if no zone holds reclaimable (stale) space. */
	if (!xfs_zoned_have_reclaimable(mp->m_zone_info))
		return false;

	available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE);

	/*
	 * GC if the writable space can't fill all open zones reserved for
	 * regular (non-GC) data placement.
	 */
	if (available <
	    xfs_rtgs_to_rfsbs(mp, mp->m_max_open_zones - XFS_OPEN_GC_ZONES))
		return true;

	free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS);

	/*
	 * Compute threshold ~= free * m_zonegc_low_space / 100 in two steps
	 * (quotient and remainder of free / 100) to avoid overflowing the
	 * intermediate product for large free counts.
	 */
	threshold = div_s64_rem(free, 100, &remainder);
	threshold = threshold * mp->m_zonegc_low_space +
		remainder * div_s64(mp->m_zonegc_low_space, 100);

	if (available < threshold)
		return true;

	return false;
}
194
/*
 * Allocate and initialize the per-mount GC state: the rmap record buffer,
 * the bioset with the gc_bio embedded in front of each bio, and the scratch
 * folios used to buffer GC data.
 *
 * Returns the new state, or NULL on allocation failure (all partially
 * acquired resources are released on the unwind path).
 */
static struct xfs_zone_gc_data *
xfs_zone_gc_data_alloc(
	struct xfs_mount	*mp)
{
	struct xfs_zone_gc_data	*data;
	int			i;

	data = kzalloc_obj(*data);
	if (!data)
		return NULL;
	data->iter.recs = kzalloc_objs(*data->iter.recs, XFS_ZONE_GC_RECS);
	if (!data->iter.recs)
		goto out_free_data;

	/* offsetof() makes bios from this set sit at the end of a gc_bio */
	if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio),
			BIOSET_NEED_BVECS))
		goto out_free_recs;
	for (i = 0; i < XFS_GC_NR_BUFS; i++) {
		data->scratch_folios[i] =
			folio_alloc(GFP_KERNEL, get_order(XFS_GC_BUF_SIZE));
		if (!data->scratch_folios[i])
			goto out_free_scratch;
	}
	data->scratch_size = XFS_GC_BUF_SIZE * XFS_GC_NR_BUFS;
	data->scratch_available = data->scratch_size;
	INIT_LIST_HEAD(&data->reading);
	INIT_LIST_HEAD(&data->writing);
	INIT_LIST_HEAD(&data->resetting);
	data->mp = mp;
	return data;

out_free_scratch:
	/* only free the folios that were successfully allocated */
	while (--i >= 0)
		folio_put(data->scratch_folios[i]);
	bioset_exit(&data->bio_set);
out_free_recs:
	kfree(data->iter.recs);
out_free_data:
	kfree(data);
	return NULL;
}
236
237 static void
xfs_zone_gc_data_free(struct xfs_zone_gc_data * data)238 xfs_zone_gc_data_free(
239 struct xfs_zone_gc_data *data)
240 {
241 int i;
242
243 for (i = 0; i < XFS_GC_NR_BUFS; i++)
244 folio_put(data->scratch_folios[i]);
245 bioset_exit(&data->bio_set);
246 kfree(data->iter.recs);
247 kfree(data);
248 }
249
/*
 * Prepare @iter to walk the rmap of @victim_rtg from the start of the zone.
 *
 * Takes over the caller's active reference to the victim rtgroup (released
 * again when the iteration finishes in xfs_zone_gc_query()) and bumps
 * rtg_gccount so the zone is not picked as a victim a second time while GC
 * I/O for it is still in flight.
 */
static void
xfs_zone_gc_iter_init(
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rtgroup	*victim_rtg)

{
	iter->next_startblock = 0;
	iter->rec_count = 0;
	iter->rec_idx = 0;
	iter->victim_rtg = victim_rtg;
	atomic_inc(&victim_rtg->rtg_gccount);
}
262
/*
 * Query the rmap of the victim zone to gather the records to evacuate.
 *
 * Callback for xfs_rmap_query_range(): copies each record into the iterator
 * batch buffer.  Returns 1 to stop the query once the batch is full, with
 * the restart point saved in iter->next_startblock.
 */
static int
xfs_zone_gc_query_cb(
	struct xfs_btree_cur		*cur,
	const struct xfs_rmap_irec	*irec,
	void				*private)
{
	struct xfs_zone_gc_iter		*iter = private;

	/* zoned data zones should only contain regular file data forks */
	ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner));
	ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner));
	ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK)));

	iter->recs[iter->rec_count] = *irec;
	if (++iter->rec_count == XFS_ZONE_GC_RECS) {
		iter->next_startblock =
			irec->rm_startblock + irec->rm_blockcount;
		return 1;
	}
	return 0;
}
286
287 static int
xfs_zone_gc_rmap_rec_cmp(const void * a,const void * b)288 xfs_zone_gc_rmap_rec_cmp(
289 const void *a,
290 const void *b)
291 {
292 const struct xfs_rmap_irec *reca = a;
293 const struct xfs_rmap_irec *recb = b;
294 int diff;
295
296 diff = cmp_int(reca->rm_owner, recb->rm_owner);
297 if (diff)
298 return diff;
299 return cmp_int(reca->rm_offset, recb->rm_offset);
300 }
301
/*
 * Fill the iterator with the next batch of rmap records from the victim
 * zone and sort them for better locality.
 *
 * When the whole zone has been consumed, drop the GC marker and the
 * reference on the victim rtgroup and clear iter->victim_rtg.  Returns 0
 * on success or a negative errno from the rmap query.
 */
static int
xfs_zone_gc_query(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter)
{
	struct xfs_rtgroup	*rtg = iter->victim_rtg;
	struct xfs_rmap_irec	ri_low = { };
	struct xfs_rmap_irec	ri_high;
	struct xfs_btree_cur	*cur;
	struct xfs_trans	*tp;
	int			error;

	ASSERT(iter->next_startblock <= rtg_blocks(rtg));
	if (iter->next_startblock == rtg_blocks(rtg))
		goto done;

	ASSERT(iter->next_startblock < rtg_blocks(rtg));
	ri_low.rm_startblock = iter->next_startblock;
	/* all-ones high key: query from the resume point to the zone end */
	memset(&ri_high, 0xFF, sizeof(ri_high));

	iter->rec_idx = 0;
	iter->rec_count = 0;

	tp = xfs_trans_alloc_empty(mp);
	xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP);
	cur = xfs_rtrmapbt_init_cursor(tp, rtg);
	error = xfs_rmap_query_range(cur, &ri_low, &ri_high,
			xfs_zone_gc_query_cb, iter);
	xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP);
	xfs_btree_del_cursor(cur, error < 0 ? error : 0);
	xfs_trans_cancel(tp);

	/* error > 0 means the callback stopped the query with a full batch */
	if (error < 0)
		return error;

	/*
	 * Sort the rmap records by inode number and increasing offset to
	 * defragment the mappings.
	 *
	 * This could be further enhanced by an even bigger look ahead window,
	 * but that's better left until we have better detection of changes to
	 * inode mapping to avoid the potential of GCing already dead data.
	 */
	sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]),
			xfs_zone_gc_rmap_rec_cmp, NULL);

	if (error == 0) {
		/*
		 * We finished iterating through the zone.
		 */
		iter->next_startblock = rtg_blocks(rtg);
		if (iter->rec_count == 0)
			goto done;
	}

	return 0;
done:
	/* zone fully iterated: undo what xfs_zone_gc_iter_init() took */
	atomic_dec(&iter->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(iter->victim_rtg);
	iter->victim_rtg = NULL;
	return 0;
}
364
/*
 * Hand out the next rmap record to garbage collect in @chunk_rec and grab a
 * reference to the owning inode in @ipp.
 *
 * Skips records whose inode was deleted or is no longer a regular realtime
 * file.  Returns false when the victim zone is exhausted; on any other
 * lookup error the file system is shut down and false is returned as well.
 */
static bool
xfs_zone_gc_iter_next(
	struct xfs_mount	*mp,
	struct xfs_zone_gc_iter	*iter,
	struct xfs_rmap_irec	*chunk_rec,
	struct xfs_inode	**ipp)
{
	struct xfs_rmap_irec	*irec;
	int			error;

	if (!iter->victim_rtg)
		return false;

retry:
	/* current batch used up - query the next one */
	if (iter->rec_idx == iter->rec_count) {
		error = xfs_zone_gc_query(mp, iter);
		if (error)
			goto fail;
		/* xfs_zone_gc_query() clears victim_rtg when the zone is done */
		if (!iter->victim_rtg)
			return false;
	}

	irec = &iter->recs[iter->rec_idx];
	error = xfs_iget(mp, NULL, irec->rm_owner,
			XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp);
	if (error) {
		/*
		 * If the inode was already deleted, skip over it.
		 */
		if (error == -ENOENT) {
			iter->rec_idx++;
			goto retry;
		}
		goto fail;
	}

	if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) {
		iter->rec_idx++;
		xfs_irele(*ipp);
		goto retry;
	}

	*chunk_rec = *irec;
	return true;

fail:
	xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	return false;
}
414
415 static void
xfs_zone_gc_iter_advance(struct xfs_zone_gc_iter * iter,xfs_extlen_t count_fsb)416 xfs_zone_gc_iter_advance(
417 struct xfs_zone_gc_iter *iter,
418 xfs_extlen_t count_fsb)
419 {
420 struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx];
421
422 irec->rm_offset += count_fsb;
423 irec->rm_startblock += count_fsb;
424 irec->rm_blockcount -= count_fsb;
425 if (!irec->rm_blockcount)
426 iter->rec_idx++;
427 }
428
/*
 * Scan one used-space bucket for the reclaim candidate with the fewest used
 * blocks.  Returns the candidate with an active group reference held, or
 * NULL if the bucket holds no suitable zone.
 *
 * Called with zi_used_buckets_lock held (see xfs_zone_gc_select_victim()).
 */
static struct xfs_rtgroup *
xfs_zone_gc_pick_victim_from(
	struct xfs_mount	*mp,
	uint32_t		bucket)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	uint32_t		victim_used = U32_MAX;
	struct xfs_rtgroup	*victim_rtg = NULL;
	uint32_t		bit;

	if (!zi->zi_used_bucket_entries[bucket])
		return NULL;

	for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket],
			mp->m_sb.sb_rgcount) {
		struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit);

		if (!rtg)
			continue;

		/*
		 * If the zone is already undergoing GC, don't pick it again.
		 *
		 * This prevents us from picking one of the zones for which we
		 * already submitted GC I/O, but for which the remapping hasn't
		 * concluded yet.  This won't cause data corruption, but
		 * increases write amplification and slows down GC, so this is
		 * a bad thing.
		 */
		if (atomic_read(&rtg->rtg_gccount)) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* skip zones that are just waiting for a reset */
		if (rtg_rmap(rtg)->i_used_blocks == 0 ||
		    rtg_rmap(rtg)->i_used_blocks >= victim_used) {
			xfs_rtgroup_rele(rtg);
			continue;
		}

		/* new best candidate - swap the held reference */
		if (victim_rtg)
			xfs_rtgroup_rele(victim_rtg);
		victim_rtg = rtg;
		victim_used = rtg_rmap(rtg)->i_used_blocks;

		/*
		 * Any zone that is less than 1 percent used is fair game for
		 * instant reclaim.  All of these zones are in the last
		 * bucket, so avoid the expensive division for the zones
		 * in the other buckets.
		 */
		if (bucket == 0 &&
		    rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100)
			break;
	}

	return victim_rtg;
}
488
/*
 * Iterate through all zones marked as reclaimable and find a candidate to
 * reclaim.
 *
 * Buckets are scanned from the most-reclaimable downward, and the first
 * bucket that yields a candidate wins.  On success the GC iterator is set
 * up for the victim and true is returned.
 */
static bool
xfs_zone_gc_select_victim(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_rtgroup	*victim_rtg = NULL;
	unsigned int		bucket;

	spin_lock(&zi->zi_used_buckets_lock);
	for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) {
		victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket);
		if (victim_rtg)
			break;
	}
	spin_unlock(&zi->zi_used_buckets_lock);

	if (!victim_rtg)
		return false;

	trace_xfs_zone_gc_select_victim(victim_rtg, bucket);
	/* hands the reference on victim_rtg over to the iterator */
	xfs_zone_gc_iter_init(iter, victim_rtg);
	return true;
}
518
/*
 * Take the open zone with the least allocated blocks away from regular data
 * placement and mark it as the GC zone.
 *
 * Returns the stolen zone, or NULL if no zone is currently open.  The zone
 * is removed from the open zone list and no longer counts against
 * zi_nr_open_zones.
 */
static struct xfs_open_zone *
xfs_zone_gc_steal_open(
	struct xfs_zone_info	*zi)
{
	struct xfs_open_zone	*oz, *found = NULL;

	spin_lock(&zi->zi_open_zones_lock);
	list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) {
		/* pick the zone with the fewest blocks allocated so far */
		if (!found || oz->oz_allocated < found->oz_allocated)
			found = oz;
	}

	if (found) {
		found->oz_is_gc = true;
		list_del_init(&found->oz_entry);
		zi->zi_nr_open_zones--;
	}

	spin_unlock(&zi->zi_open_zones_lock);
	return found;
}
540
/*
 * Open a new GC target zone and install it as zi_open_gc_zone.
 *
 * Returns NULL if the previous target zone still has writes in flight (the
 * caller will retry later) or if no new zone could be opened.
 */
static struct xfs_open_zone *
xfs_zone_gc_select_target(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_open_zone	*oz = zi->zi_open_gc_zone;

	/*
	 * We need to wait for pending writes to finish.
	 */
	if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg))
		return NULL;

	ASSERT(zi->zi_nr_open_zones <=
		mp->m_max_open_zones - XFS_OPEN_GC_ZONES);
	oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
	if (oz)
		trace_xfs_zone_gc_target_opened(oz->oz_rtg);
	spin_lock(&zi->zi_open_zones_lock);
	zi->zi_open_gc_zone = oz;
	spin_unlock(&zi->zi_open_zones_lock);
	return oz;
}
564
565 /*
566 * Ensure we have a valid open zone to write the GC data to.
567 *
568 * If the current target zone has space keep writing to it, else first wait for
569 * all pending writes and then pick a new one.
570 */
571 static struct xfs_open_zone *
xfs_zone_gc_ensure_target(struct xfs_mount * mp)572 xfs_zone_gc_ensure_target(
573 struct xfs_mount *mp)
574 {
575 struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone;
576
577 if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
578 return xfs_zone_gc_select_target(mp);
579 return oz;
580 }
581
/*
 * Bio completion handler for GC reads, writes and zone resets.
 *
 * Only marks the chunk as done and kicks the GC thread; all actual
 * completion processing happens in xfs_zone_gc_handle_work().
 */
static void
xfs_zone_gc_end_io(
	struct bio		*bio)
{
	struct xfs_gc_bio	*chunk =
		container_of(bio, struct xfs_gc_bio, bio);
	struct xfs_zone_gc_data	*data = chunk->data;

	WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE);
	wake_up_process(data->mp->m_zone_info->zi_gc_thread);
}
593
/*
 * Allocate up to *count_fsb blocks in the GC target zone.
 *
 * *count_fsb may be trimmed to the available scratch space, the remaining
 * space in the target zone, and the reserved free space pools; it is zero
 * when nothing could be allocated.  On success, returns the open zone with
 * an extra reference held, and sets *daddr / *is_seq for the allocation.
 */
static struct xfs_open_zone *
xfs_zone_gc_alloc_blocks(
	struct xfs_zone_gc_data	*data,
	xfs_extlen_t		*count_fsb,
	xfs_daddr_t		*daddr,
	bool			*is_seq)
{
	struct xfs_mount	*mp = data->mp;
	struct xfs_open_zone	*oz;

	oz = xfs_zone_gc_ensure_target(mp);
	if (!oz)
		return NULL;

	/* never read more than fits into the scratch ring buffer */
	*count_fsb = min(*count_fsb, XFS_B_TO_FSB(mp, data->scratch_available));

	/*
	 * Directly allocate GC blocks from the reserved pool.
	 *
	 * If we'd take them from the normal pool we could be stealing blocks
	 * from a regular writer, which would then have to wait for GC and
	 * deadlock.
	 */
	spin_lock(&mp->m_sb_lock);
	*count_fsb = min(*count_fsb,
			rtg_blocks(oz->oz_rtg) - oz->oz_allocated);
	*count_fsb = min3(*count_fsb,
			mp->m_free[XC_FREE_RTEXTENTS].res_avail,
			mp->m_free[XC_FREE_RTAVAILABLE].res_avail);
	mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb;
	mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb;
	spin_unlock(&mp->m_sb_lock);

	if (!*count_fsb)
		return NULL;

	*daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0);
	*is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr);
	/*
	 * For conventional zones we write at the allocation offset; for
	 * sequential write required zones the device picks the position via
	 * zone append, so the daddr stays at the zone start here.
	 */
	if (!*is_seq)
		*daddr += XFS_FSB_TO_BB(mp, oz->oz_allocated);
	oz->oz_allocated += *count_fsb;
	atomic_inc(&oz->oz_ref);
	return oz;
}
638
/*
 * Map chunk->len bytes of the scratch ring buffer, starting at the current
 * head, into the chunk's bio.
 *
 * The scratch space is built from XFS_GC_NR_BUFS discontiguous folios of
 * XFS_GC_BUF_SIZE each, so a range may have to be split at folio boundaries
 * and may wrap around the end of the ring.
 */
static void
xfs_zone_gc_add_data(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	unsigned int		len = chunk->len;
	unsigned int		off = data->scratch_head;

	do {
		/* position within the current folio, capped at its end */
		unsigned int this_off = off % XFS_GC_BUF_SIZE;
		unsigned int this_len = min(len, XFS_GC_BUF_SIZE - this_off);

		bio_add_folio_nofail(&chunk->bio,
			data->scratch_folios[off / XFS_GC_BUF_SIZE],
			this_len, this_off);
		len -= this_len;
		off += this_len;
		/* wrap around to the start of the ring buffer */
		if (off == data->scratch_size)
			off = 0;
	} while (len);
}
660
/*
 * Start garbage collecting the next chunk: take the next rmap record from
 * the victim zone, allocate space in the GC target zone, and submit a read
 * of the old data into the scratch buffer.
 *
 * Returns true if a read was submitted, false if there is nothing (more)
 * that can be started right now.
 */
static bool
xfs_zone_gc_start_chunk(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_gc_iter	*iter = &data->iter;
	struct xfs_mount	*mp = data->mp;
	struct block_device	*bdev = mp->m_rtdev_targp->bt_bdev;
	struct xfs_open_zone	*oz;
	struct xfs_rmap_irec	irec;
	struct xfs_gc_bio	*chunk;
	struct xfs_inode	*ip;
	struct bio		*bio;
	xfs_daddr_t		daddr;
	unsigned int		len;
	bool			is_seq;

	if (xfs_is_shutdown(mp))
		return false;

	if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip))
		return false;
	/* may shorten irec.rm_blockcount to what fits right now */
	oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr,
			&is_seq);
	if (!oz) {
		xfs_irele(ip);
		return false;
	}

	len = XFS_FSB_TO_B(mp, irec.rm_blockcount);
	/* one extra bvec in case the range straddles a scratch folio boundary */
	bio = bio_alloc_bioset(bdev,
		min(howmany(len, XFS_GC_BUF_SIZE) + 1, XFS_GC_NR_BUFS),
		REQ_OP_READ, GFP_NOFS, &data->bio_set);

	chunk = container_of(bio, struct xfs_gc_bio, bio);
	chunk->ip = ip;
	chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset);
	chunk->len = len;
	chunk->old_startblock =
		xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock);
	chunk->new_daddr = daddr;
	chunk->is_seq = is_seq;
	chunk->data = data;
	chunk->oz = oz;
	/* chunk holds its own references on the victim zone until freed */
	chunk->victim_rtg = iter->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock);
	bio->bi_end_io = xfs_zone_gc_end_io;
	xfs_zone_gc_add_data(chunk);
	/* account the scratch space consumed by this chunk */
	data->scratch_head = (data->scratch_head + len) % data->scratch_size;
	data->scratch_available -= len;

	XFS_STATS_INC(mp, xs_gc_read_calls);

	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&chunk->entry, &data->reading);
	xfs_zone_gc_iter_advance(iter, irec.rm_blockcount);

	submit_bio(bio);
	return true;
}
723
/*
 * Release all resources held by a GC chunk: the victim zone GC marker and
 * group reference, the list membership, the open zone and inode references,
 * and finally the bio (and with it the gc_bio) itself.
 */
static void
xfs_zone_gc_free_chunk(
	struct xfs_gc_bio	*chunk)
{
	atomic_dec(&chunk->victim_rtg->rtg_gccount);
	xfs_rtgroup_rele(chunk->victim_rtg);
	list_del(&chunk->entry);
	xfs_open_zone_put(chunk->oz);
	xfs_irele(chunk->ip);
	bio_put(&chunk->bio);
}
735
/*
 * Submit the GC write for a chunk.
 *
 * For sequential write required zones the bio is converted to a zone
 * append, so the device picks the write position and reports it back in
 * bi_sector on completion (consumed by xfs_zone_gc_finish_chunk()).
 */
static void
xfs_zone_gc_submit_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	if (chunk->is_seq) {
		chunk->bio.bi_opf &= ~REQ_OP_WRITE;
		chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND;
	}
	chunk->bio.bi_iter.bi_sector = chunk->new_daddr;
	chunk->bio.bi_end_io = xfs_zone_gc_end_io;
	submit_bio(&chunk->bio);
}
749
/*
 * Split off the front of a GC write that exceeds the device's zone append
 * limit into its own chunk, so each submitted bio stays within the limit.
 *
 * Returns the split-off front chunk (inserted into the list right before
 * the original chunk), or NULL when no split is needed.  Only applies to
 * sequential write required zones; conventional zone writes are not split
 * here.
 */
static struct xfs_gc_bio *
xfs_zone_gc_split_write(
	struct xfs_zone_gc_data	*data,
	struct xfs_gc_bio	*chunk)
{
	struct queue_limits	*lim =
		&bdev_get_queue(chunk->bio.bi_bdev)->limits;
	struct xfs_gc_bio	*split_chunk;
	int			split_sectors;
	unsigned int		split_len;
	struct bio		*split;
	unsigned int		nsegs;

	if (!chunk->is_seq)
		return NULL;

	split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs,
			lim->max_zone_append_sectors << SECTOR_SHIFT);
	if (!split_sectors)
		return NULL;

	/* ensure the split chunk is still block size aligned */
	split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT,
			data->mp->m_sb.sb_blocksize) >> SECTOR_SHIFT;
	split_len = split_sectors << SECTOR_SHIFT;

	split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set);
	split_chunk = container_of(split, struct xfs_gc_bio, bio);
	split_chunk->data = data;
	/* the split chunk needs its own references on everything it points to */
	ihold(VFS_I(chunk->ip));
	split_chunk->ip = chunk->ip;
	split_chunk->is_seq = chunk->is_seq;
	split_chunk->scratch = chunk->scratch;
	split_chunk->offset = chunk->offset;
	split_chunk->len = split_len;
	split_chunk->old_startblock = chunk->old_startblock;
	split_chunk->new_daddr = chunk->new_daddr;
	split_chunk->oz = chunk->oz;
	atomic_inc(&chunk->oz->oz_ref);

	split_chunk->victim_rtg = chunk->victim_rtg;
	atomic_inc(&chunk->victim_rtg->rtg_group.xg_active_ref);
	atomic_inc(&chunk->victim_rtg->rtg_gccount);

	/* advance the remainder of the original chunk past the split */
	chunk->offset += split_len;
	chunk->len -= split_len;
	chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len);

	/* add right before the original chunk */
	WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW);
	list_add_tail(&split_chunk->entry, &chunk->entry);
	return split_chunk;
}
803
/*
 * A GC read completed: turn the chunk's bio around and write the buffered
 * data to its new location in the GC target zone, splitting the write as
 * required by the device's zone append limit.
 */
static void
xfs_zone_gc_write_chunk(
	struct xfs_gc_bio	*chunk)
{
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_mount	*mp = chunk->ip->i_mount;
	struct xfs_gc_bio	*split_chunk;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_write_calls);
	XFS_STATS_ADD(mp, xs_gc_bytes, chunk->len);

	/* move to the writing list before the new I/O can complete */
	WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
	list_move_tail(&chunk->entry, &data->writing);

	bio_reuse(&chunk->bio, REQ_OP_WRITE);
	while ((split_chunk = xfs_zone_gc_split_write(data, chunk)))
		xfs_zone_gc_submit_write(data, split_chunk);
	xfs_zone_gc_submit_write(data, chunk);
}
830
/*
 * A GC write completed: return the scratch space and remap the file's
 * extent from the old to the new location, unless another write raced with
 * us for this range in the meantime.
 */
static void
xfs_zone_gc_finish_chunk(
	struct xfs_gc_bio	*chunk)
{
	uint			iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
	struct xfs_zone_gc_data	*data = chunk->data;
	struct xfs_inode	*ip = chunk->ip;
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	if (chunk->bio.bi_status)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	if (xfs_is_shutdown(mp)) {
		xfs_zone_gc_free_chunk(chunk);
		return;
	}

	/* scratch space for this chunk can be reused now */
	data->scratch_tail =
		(data->scratch_tail + chunk->len) % data->scratch_size;
	data->scratch_available += chunk->len;

	/*
	 * Cycle through the iolock and wait for direct I/O and layouts to
	 * ensure no one is reading from the old mapping before it goes away.
	 *
	 * Note that xfs_zoned_end_io() below checks that no other writer raced
	 * with us to update the mapping by checking that the old startblock
	 * didn't change.
	 */
	xfs_ilock(ip, iolock);
	error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP);
	if (!error)
		inode_dio_wait(VFS_I(ip));
	xfs_iunlock(ip, iolock);
	if (error)
		goto free;

	/* zone appends report the actual write position in bi_sector */
	if (chunk->is_seq)
		chunk->new_daddr = chunk->bio.bi_iter.bi_sector;
	error = xfs_zoned_end_io(ip, chunk->offset, chunk->len,
			chunk->new_daddr, chunk->oz, chunk->old_startblock);
free:
	if (error)
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
	xfs_zone_gc_free_chunk(chunk);
}
877
/*
 * A zone reset completed: mark the zone free again, account its blocks as
 * available, and wake any writers waiting for free zones.
 */
static void
xfs_zone_gc_finish_reset(
	struct xfs_gc_bio	*chunk)
{
	/* reset bios carry the rtgroup in bi_private, not an inode */
	struct xfs_rtgroup	*rtg = chunk->bio.bi_private;
	struct xfs_mount	*mp = rtg_mount(rtg);
	struct xfs_zone_info	*zi = mp->m_zone_info;

	if (chunk->bio.bi_status) {
		xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR);
		goto out;
	}

	xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE);
	atomic_inc(&zi->zi_nr_free_zones);

	xfs_zoned_add_available(mp, rtg_blocks(rtg));

	wake_up_all(&zi->zi_zone_wait);
out:
	list_del(&chunk->entry);
	bio_put(&chunk->bio);
}
901
/*
 * Submit @bio to reset the zone backing @rtg.
 *
 * For conventional (non-sequential) zones a discard is issued instead, or
 * the bio is completed immediately when the device supports neither, so
 * the caller's completion-driven state machine always makes progress.
 */
static void
xfs_submit_zone_reset_bio(
	struct xfs_rtgroup	*rtg,
	struct bio		*bio)
{
	struct xfs_mount	*mp = rtg_mount(rtg);

	trace_xfs_zone_reset(rtg);

	/* a zone must not be reset while it still holds live data */
	ASSERT(rtg_rmap(rtg)->i_used_blocks == 0);

	/* error injection hook to exercise the reset failure path */
	if (XFS_TEST_ERROR(mp, XFS_ERRTAG_ZONE_RESET)) {
		bio_io_error(bio);
		return;
	}

	XFS_STATS_INC(mp, xs_gc_zone_reset_calls);

	bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0);
	if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) {
		/*
		 * Also use the bio to drive the state machine when neither
		 * zone reset nor discard is supported to keep things simple.
		 */
		if (!bdev_max_discard_sectors(bio->bi_bdev)) {
			bio_endio(bio);
			return;
		}
		bio->bi_opf &= ~REQ_OP_ZONE_RESET;
		bio->bi_opf |= REQ_OP_DISCARD;
		bio->bi_iter.bi_size = XFS_FSB_TO_B(mp, rtg_blocks(rtg));
	}

	submit_bio(bio);
}
937
xfs_bio_wait_endio(struct bio * bio)938 static void xfs_bio_wait_endio(struct bio *bio)
939 {
940 complete(bio->bi_private);
941 }
942
/*
 * Synchronously reset the zone backing @rtg and wait for the reset (or the
 * fallback discard) to complete.
 *
 * Returns 0 on success or a negative errno derived from the bio status.
 */
int
xfs_zone_gc_reset_sync(
	struct xfs_rtgroup	*rtg)
{
	DECLARE_COMPLETION_ONSTACK(done);
	struct bio		bio;
	int			error;

	bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0,
			REQ_OP_ZONE_RESET | REQ_SYNC);
	bio.bi_private = &done;
	bio.bi_end_io = xfs_bio_wait_endio;
	xfs_submit_zone_reset_bio(rtg, &bio);
	wait_for_completion_io(&done);

	error = blk_status_to_errno(bio.bi_status);
	bio_uninit(&bio);
	return error;
}
962
/*
 * Submit reset bios for all zones on @reset_list.
 *
 * The RT device cache is flushed first, and the rmap inode log is forced
 * for each zone, so that all data and all transactions referencing the
 * old mappings are stable before the zone's blocks become reusable.
 */
static void
xfs_zone_gc_reset_zones(
	struct xfs_zone_gc_data	*data,
	struct xfs_group	*reset_list)
{
	struct xfs_group	*next = reset_list;

	if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) {
		xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR);
		return;
	}

	do {
		struct xfs_rtgroup	*rtg = to_rtg(next);
		struct xfs_gc_bio	*chunk;
		struct bio		*bio;

		/* make sure transactions referencing the rmap are on disk */
		xfs_log_force_inode(rtg_rmap(rtg));

		/* unlink the group from the singly-linked reset list */
		next = rtg_group(rtg)->xg_next_reset;
		rtg_group(rtg)->xg_next_reset = NULL;

		bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev,
				0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set);
		bio->bi_private = rtg;
		bio->bi_end_io = xfs_zone_gc_end_io;

		chunk = container_of(bio, struct xfs_gc_bio, bio);
		chunk->data = data;
		WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW);
		list_add_tail(&chunk->entry, &data->resetting);
		xfs_submit_zone_reset_bio(rtg, bio);
	} while (next);
}
997
/*
 * Decide if new GC chunks can and should be started.
 *
 * Requires scratch space and a writable target zone.  A new victim is only
 * selected when GC is actually needed and the thread is not about to stop
 * or park; an already selected victim is always drained.
 */
static bool
xfs_zone_gc_should_start_new_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_open_zone	*oz;

	if (xfs_is_shutdown(data->mp))
		return false;
	if (!data->scratch_available)
		return false;

	oz = xfs_zone_gc_ensure_target(data->mp);
	if (!oz || oz->oz_allocated == rtg_blocks(oz->oz_rtg))
		return false;

	if (!data->iter.victim_rtg) {
		if (kthread_should_stop() || kthread_should_park())
			return false;
		if (!xfs_zoned_need_gc(data->mp))
			return false;
		if (!xfs_zone_gc_select_victim(data))
			return false;
	}

	return true;
}
1024
/*
 * Handle the work to read and write data for GC and to reset the zones,
 * including handling all completions.
 *
 * Note that the order of the chunks is preserved so that we don't undo the
 * optimal order established by xfs_zone_gc_query().
 *
 * Completions are processed in pipeline order (resets, then writes, then
 * reads) before new work is started.  set_current_state(TASK_RUNNING) keeps
 * the caller's subsequent schedule() from sleeping whenever any work was
 * done in this pass.
 */
static void
xfs_zone_gc_handle_work(
	struct xfs_zone_gc_data	*data)
{
	struct xfs_zone_info	*zi = data->mp->m_zone_info;
	struct xfs_gc_bio	*chunk, *next;
	struct xfs_group	*reset_list;
	struct blk_plug		plug;

	/* atomically take over the list of zones queued for reset */
	spin_lock(&zi->zi_reset_list_lock);
	reset_list = zi->zi_reset_list;
	zi->zi_reset_list = NULL;
	spin_unlock(&zi->zi_reset_list_lock);

	if (reset_list) {
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_reset_zones(data, reset_list);
	}

	/* each list is in submission order; stop at the first unfinished chunk */
	list_for_each_entry_safe(chunk, next, &data->resetting, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_reset(chunk);
	}

	list_for_each_entry_safe(chunk, next, &data->writing, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_finish_chunk(chunk);
	}

	/* completed reads turn into writes; batch them under a plug */
	blk_start_plug(&plug);
	list_for_each_entry_safe(chunk, next, &data->reading, entry) {
		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
			break;
		set_current_state(TASK_RUNNING);
		xfs_zone_gc_write_chunk(chunk);
	}
	blk_finish_plug(&plug);

	if (xfs_zone_gc_should_start_new_work(data)) {
		set_current_state(TASK_RUNNING);
		blk_start_plug(&plug);
		while (xfs_zone_gc_start_chunk(data))
			;
		blk_finish_plug(&plug);
	}
}
1082
1083 /*
1084 * Note that the current GC algorithm would break reflinks and thus duplicate
1085 * data that was shared by multiple owners before. Because of that reflinks
1086 * are currently not supported on zoned file systems and can't be created or
1087 * mounted.
1088 */
static int
xfs_zoned_gcd(
	void			*private)
{
	struct xfs_zone_gc_data	*data = private;
	struct xfs_mount	*mp = data->mp;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	unsigned int		nofs_flag;

	/*
	 * GC performs file system modifications itself, so avoid recursing
	 * into the file system from memory allocations done on its behalf.
	 */
	nofs_flag = memalloc_nofs_save();
	set_freezable();

	for (;;) {
		/*
		 * Mark ourselves as about to sleep *before* looking for work;
		 * xfs_zone_gc_handle_work() flips the state back to
		 * TASK_RUNNING whenever it makes progress.
		 */
		set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE);
		xfs_set_zonegc_running(mp);

		xfs_zone_gc_handle_work(data);

		/*
		 * Only sleep if nothing set the state to running.  Else check for
		 * work again as someone might have queued up more work and woken
		 * us in the meantime.
		 */
		if (get_current_state() == TASK_RUNNING) {
			try_to_freeze();
			continue;
		}

		/*
		 * Only honor stop/park requests once all in-flight GC I/O and
		 * pending zone resets have drained.
		 */
		if (list_empty(&data->reading) &&
		    list_empty(&data->writing) &&
		    list_empty(&data->resetting) &&
		    !zi->zi_reset_list) {
			xfs_clear_zonegc_running(mp);
			/* GC is idle now, so wake any space reservation waiters. */
			xfs_zoned_resv_wake_all(mp);

			if (kthread_should_stop()) {
				__set_current_state(TASK_RUNNING);
				break;
			}

			if (kthread_should_park()) {
				__set_current_state(TASK_RUNNING);
				kthread_parkme();
				continue;
			}
		}

		schedule();
	}
	xfs_clear_zonegc_running(mp);

	/* Drop the reference on a victim still held from a partial GC pass. */
	if (data->iter.victim_rtg)
		xfs_rtgroup_rele(data->iter.victim_rtg);

	memalloc_nofs_restore(nofs_flag);
	xfs_zone_gc_data_free(data);
	return 0;
}
1147
1148 void
xfs_zone_gc_start(struct xfs_mount * mp)1149 xfs_zone_gc_start(
1150 struct xfs_mount *mp)
1151 {
1152 if (xfs_has_zoned(mp))
1153 kthread_unpark(mp->m_zone_info->zi_gc_thread);
1154 }
1155
1156 void
xfs_zone_gc_stop(struct xfs_mount * mp)1157 xfs_zone_gc_stop(
1158 struct xfs_mount *mp)
1159 {
1160 if (xfs_has_zoned(mp))
1161 kthread_park(mp->m_zone_info->zi_gc_thread);
1162 }
1163
1164 int
xfs_zone_gc_mount(struct xfs_mount * mp)1165 xfs_zone_gc_mount(
1166 struct xfs_mount *mp)
1167 {
1168 struct xfs_zone_info *zi = mp->m_zone_info;
1169 struct xfs_zone_gc_data *data;
1170 struct xfs_open_zone *oz;
1171 int error;
1172
1173 /*
1174 * If there are no free zones available for GC, pick the open zone with
1175 * the least used space to GC into. This should only happen after an
1176 * unclean shutdown near ENOSPC while GC was ongoing.
1177 *
1178 * We also need to do this for the first gc zone allocation if we
1179 * unmounted while at the open limit.
1180 */
1181 if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) ||
1182 zi->zi_nr_open_zones == mp->m_max_open_zones)
1183 oz = xfs_zone_gc_steal_open(zi);
1184 else
1185 oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true);
1186 if (!oz) {
1187 xfs_warn(mp, "unable to allocate a zone for gc");
1188 error = -EIO;
1189 goto out;
1190 }
1191
1192 trace_xfs_zone_gc_target_opened(oz->oz_rtg);
1193 zi->zi_open_gc_zone = oz;
1194
1195 data = xfs_zone_gc_data_alloc(mp);
1196 if (!data) {
1197 error = -ENOMEM;
1198 goto out_put_gc_zone;
1199 }
1200
1201 zi->zi_gc_thread = kthread_create(xfs_zoned_gcd, data,
1202 "xfs-zone-gc/%s", mp->m_super->s_id);
1203 if (IS_ERR(zi->zi_gc_thread)) {
1204 xfs_warn(mp, "unable to create zone gc thread");
1205 error = PTR_ERR(zi->zi_gc_thread);
1206 goto out_free_gc_data;
1207 }
1208
1209 /* xfs_zone_gc_start will unpark for rw mounts */
1210 kthread_park(zi->zi_gc_thread);
1211 return 0;
1212
1213 out_free_gc_data:
1214 kfree(data);
1215 out_put_gc_zone:
1216 xfs_open_zone_put(zi->zi_open_gc_zone);
1217 out:
1218 return error;
1219 }
1220
1221 void
xfs_zone_gc_unmount(struct xfs_mount * mp)1222 xfs_zone_gc_unmount(
1223 struct xfs_mount *mp)
1224 {
1225 struct xfs_zone_info *zi = mp->m_zone_info;
1226
1227 kthread_stop(zi->zi_gc_thread);
1228 if (zi->zi_open_gc_zone)
1229 xfs_open_zone_put(zi->zi_open_gc_zone);
1230 }
1231