// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"

/*
 * Note: the zoned allocator does not support a rtextsize > 1, so this code and
 * the allocator itself use file system blocks interchangeably with realtime
 * extents without doing the otherwise required conversions.
 */

/*
 * Per-task space reservation.
 *
 * Tasks that need to wait for GC to free up space allocate one of these
 * on-stack and add it to the per-mount zi_reclaim_reservations list.
 * The GC thread will then wake the tasks in order when space becomes
 * available.
 */
struct xfs_zone_reservation {
	struct list_head	entry;
	struct task_struct	*task;
	xfs_filblks_t		count_fsb;
};

/*
 * Calculate the number of reserved blocks.
 *
 * XC_FREE_RTEXTENTS counts the user-available capacity up to which the file
 * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
 * available for writes without waiting for GC.
 *
 * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
 * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
 * is further reduced by at least one zone's worth of blocks as well as the
 * optional persistently reserved blocks.  This allows the allocator to run
 * more smoothly by not always triggering GC.
 */
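/*
 * Worked example with made-up numbers (the real constants live in
 * xfs_zones.h and the superblock): assuming 65536-block zones,
 * sb_rtreserved == 0, XFS_GC_ZONES == 2 and XFS_RESERVED_ZONES == 3,
 * allocations that can't dip into the reserved pool would see ENOSPC once
 * fewer than 3 * 65536 blocks remain free in XC_FREE_RTEXTENTS, while only
 * the last 2 * 65536 free blocks are held back from immediate use in
 * XC_FREE_RTAVAILABLE.
 */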
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		return (uint64_t)XFS_RESERVED_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		return (uint64_t)XFS_GC_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks;
	default:
		ASSERT(0);
		return 0;
	}
}

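/*
 * Wake up all tasks waiting on the reclaim reservation list so that they can
 * re-evaluate their wait condition, e.g. after a shutdown when no more space
 * will become available.
 */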
void
xfs_zoned_resv_wake_all(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation *reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

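/*
 * Add freed blocks back to the instantly available counter and, if there are
 * tasks waiting for space, wake them in FIFO order for as long as their
 * requested reservations fit into the now available blocks.
 */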
void
xfs_zoned_add_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation *reservation;

	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;
	}
	spin_unlock(&zi->zi_reservation_lock);
}

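/* Check for conditions that should abort the wait for reclaimed space. */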
static int
xfs_zoned_space_wait_error(
	struct xfs_mount	*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

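/*
 * Reserve count_fsb blocks from the instantly available pool
 * (XC_FREE_RTAVAILABLE).  If the blocks can't be had immediately and the
 * caller is allowed to wait, queue up on the reclaim reservation list, kick
 * GC if it isn't running already and sleep until enough space was freed or
 * the wait has to be aborted (shutdown, fatal signal, or no reclaimable
 * space left).
 */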
static int
xfs_zoned_reserve_available(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	struct xfs_zone_reservation reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int			error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool, also bypass the
	 * wait list.  This relies on the fact that we have a very generously
	 * sized reserved pool that always has enough space.  If the reserved
	 * allocations fail we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * Make sure to start GC if it is not running already.  As we
		 * check the rtavailable count when filling up zones, GC is
		 * normally already running at this point, but in some setups
		 * with very few zones we may completely run out of non-
		 * reserved blocks in between filling zones.
		 */
		if (!xfs_is_zonegc_running(mp))
			wake_up_process(zi->zi_gc_thread);

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request, give up as we're fully out
		 * of space.
		 */
		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
		    !xfs_is_zonegc_running(mp))
			break;

		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}

/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_inode	*ip,
	xfs_filblks_t		*count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	s64			len = *count_fsb;
	int			error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}

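/*
 * Reserve space for a zoned write: first take the blocks out of the user
 * capacity (XC_FREE_RTEXTENTS), optionally trimming the request for short
 * writes when XFS_ZR_GREEDY is set, then wait for the same number of blocks
 * to become instantly available (XC_FREE_RTAVAILABLE).  On success the
 * reservation is recorded in the allocation context passed in by the caller
 * and must be released with xfs_zoned_space_unreserve().
 *
 * Rough usage sketch (not copied from an actual caller):
 *
 *	struct xfs_zone_alloc_ctx ac = { };
 *
 *	error = xfs_zoned_space_reserve(ip, count_fsb, XFS_ZR_GREEDY, &ac);
 *	if (error)
 *		return error;
 *	... allocate blocks and perform the write using ac ...
 *	xfs_zoned_space_unreserve(ip, &ac);
 */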
int
xfs_zoned_space_reserve(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx *ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(ip, count_fsb, flags);
	if (error) {
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}

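/*
 * Release a reservation taken by xfs_zoned_space_reserve(): return any unused
 * blocks to both free space counters and drop the reference on the cached
 * open zone, if any.
 */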
void
xfs_zoned_space_unreserve(
	struct xfs_inode	*ip,
	struct xfs_zone_alloc_ctx *ac)
{
	if (ac->reserved_blocks > 0) {
		struct xfs_mount	*mp = ip->i_mount;

		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}