xref: /linux/fs/xfs/xfs_zone_space_resv.c (revision f09fc24dd9a5ec989dfdde7090624924ede6ddc7)
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2023-2025 Christoph Hellwig.
4  * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
5  */
6 #include "xfs.h"
7 #include "xfs_shared.h"
8 #include "xfs_format.h"
9 #include "xfs_trans_resv.h"
10 #include "xfs_mount.h"
11 #include "xfs_inode.h"
12 #include "xfs_rtbitmap.h"
13 #include "xfs_zone_alloc.h"
14 #include "xfs_zone_priv.h"
15 #include "xfs_zones.h"
16 
17 /*
 * Note: the zoned allocator does not support a rtextsize > 1, so this code and
 * the allocator itself use file system blocks interchangeably with realtime
 * extents without doing the otherwise required conversions.
21  */
22 
/*
 * Per-task space reservation.
 *
 * A task that needs to wait for GC to free up space allocates one of these on
 * its stack and adds it to the per-mount zi_reclaim_reservations list.  The GC
 * thread will then wake the tasks in order when space becomes available.
 */
struct xfs_zone_reservation {
	/* link in the per-mount zi_reclaim_reservations list */
	struct list_head	entry;
	/* task that queued this reservation and gets woken up for it */
	struct task_struct	*task;
	/* number of blocks the task is waiting for */
	xfs_filblks_t		count_fsb;
};
35 
36 /*
37  * Calculate the number of reserved blocks.
38  *
39  * XC_FREE_RTEXTENTS counts the user available capacity, to which the file
40  * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly
41  * available for writes without waiting for GC.
42  *
43  * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and
44  * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
45  * is further restricted by at least one zone as well as the optional
46  * persistently reserved blocks.  This allows the allocator to run more
47  * smoothly by not always triggering GC.
48  */
49 uint64_t
50 xfs_zoned_default_resblks(
51 	struct xfs_mount	*mp,
52 	enum xfs_free_counter	ctr)
53 {
54 	switch (ctr) {
55 	case XC_FREE_RTEXTENTS:
56 		return (uint64_t)XFS_RESERVED_ZONES *
57 			mp->m_groups[XG_TYPE_RTG].blocks +
58 			mp->m_sb.sb_rtreserved;
59 	case XC_FREE_RTAVAILABLE:
60 		return (uint64_t)XFS_GC_ZONES *
61 			mp->m_groups[XG_TYPE_RTG].blocks;
62 	default:
63 		ASSERT(0);
64 		return 0;
65 	}
66 }
67 
68 void
69 xfs_zoned_resv_wake_all(
70 	struct xfs_mount		*mp)
71 {
72 	struct xfs_zone_info		*zi = mp->m_zone_info;
73 	struct xfs_zone_reservation	*reservation;
74 
75 	spin_lock(&zi->zi_reservation_lock);
76 	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
77 		wake_up_process(reservation->task);
78 	spin_unlock(&zi->zi_reservation_lock);
79 }
80 
81 void
82 xfs_zoned_add_available(
83 	struct xfs_mount		*mp,
84 	xfs_filblks_t			count_fsb)
85 {
86 	struct xfs_zone_info		*zi = mp->m_zone_info;
87 	struct xfs_zone_reservation	*reservation;
88 
89 	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
90 		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
91 		return;
92 	}
93 
94 	spin_lock(&zi->zi_reservation_lock);
95 	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
96 	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
97 	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
98 		if (reservation->count_fsb > count_fsb)
99 			break;
100 		wake_up_process(reservation->task);
101 		count_fsb -= reservation->count_fsb;
102 
103 	}
104 	spin_unlock(&zi->zi_reservation_lock);
105 }
106 
107 static int
108 xfs_zoned_space_wait_error(
109 	struct xfs_mount		*mp)
110 {
111 	if (xfs_is_shutdown(mp))
112 		return -EIO;
113 	if (fatal_signal_pending(current))
114 		return -EINTR;
115 	return 0;
116 }
117 
118 static int
119 xfs_zoned_reserve_available(
120 	struct xfs_mount		*mp,
121 	xfs_filblks_t			count_fsb,
122 	unsigned int			flags)
123 {
124 	struct xfs_zone_info		*zi = mp->m_zone_info;
125 	struct xfs_zone_reservation	reservation = {
126 		.task		= current,
127 		.count_fsb	= count_fsb,
128 	};
129 	int				error;
130 
131 	/*
132 	 * If there are no waiters, try to directly grab the available blocks
133 	 * from the percpu counter.
134 	 *
135 	 * If the caller wants to dip into the reserved pool also bypass the
136 	 * wait list.  This relies on the fact that we have a very graciously
137 	 * sized reserved pool that always has enough space.  If the reserved
138 	 * allocations fail we're in trouble.
139 	 */
140 	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
141 	    (flags & XFS_ZR_RESERVED))) {
142 		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
143 				flags & XFS_ZR_RESERVED);
144 		if (error != -ENOSPC)
145 			return error;
146 	}
147 
148 	if (flags & XFS_ZR_NOWAIT)
149 		return -EAGAIN;
150 
151 	spin_lock(&zi->zi_reservation_lock);
152 	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
153 	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
154 		set_current_state(TASK_KILLABLE);
155 
156 		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
157 				flags & XFS_ZR_RESERVED);
158 		if (error != -ENOSPC)
159 			break;
160 
161 		/*
162 		 * Make sure to start GC if it is not running already. As we
163 		 * check the rtavailable count when filling up zones, GC is
164 		 * normally already running at this point, but in some setups
165 		 * with very few zones we may completely run out of non-
166 		 * reserved blocks in between filling zones.
167 		 */
168 		if (!xfs_is_zonegc_running(mp))
169 			wake_up_process(zi->zi_gc_thread);
170 
171 		/*
172 		 * If there is no reclaimable group left and we aren't still
173 		 * processing a pending GC request give up as we're fully out
174 		 * of space.
175 		 */
176 		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
177 		    !xfs_is_zonegc_running(mp))
178 			break;
179 
180 		spin_unlock(&zi->zi_reservation_lock);
181 		schedule();
182 		spin_lock(&zi->zi_reservation_lock);
183 	}
184 	list_del(&reservation.entry);
185 	spin_unlock(&zi->zi_reservation_lock);
186 
187 	__set_current_state(TASK_RUNNING);
188 	return error;
189 }
190 
191 /*
192  * Implement greedy space allocation for short writes by trying to grab all
193  * that is left after locking out other threads from trying to do the same.
194  *
195  * This isn't exactly optimal and can hopefully be replaced by a proper
196  * percpu_counter primitive one day.
197  */
198 static int
199 xfs_zoned_reserve_extents_greedy(
200 	struct xfs_mount		*mp,
201 	xfs_filblks_t			*count_fsb,
202 	unsigned int			flags)
203 {
204 	struct xfs_zone_info		*zi = mp->m_zone_info;
205 	s64				len = *count_fsb;
206 	int				error = -ENOSPC;
207 
208 	spin_lock(&zi->zi_reservation_lock);
209 	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
210 	if (len > 0) {
211 		*count_fsb = len;
212 		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
213 				flags & XFS_ZR_RESERVED);
214 	}
215 	spin_unlock(&zi->zi_reservation_lock);
216 	return error;
217 }
218 
219 int
220 xfs_zoned_space_reserve(
221 	struct xfs_mount		*mp,
222 	xfs_filblks_t			count_fsb,
223 	unsigned int			flags,
224 	struct xfs_zone_alloc_ctx	*ac)
225 {
226 	int				error;
227 
228 	ASSERT(ac->reserved_blocks == 0);
229 	ASSERT(ac->open_zone == NULL);
230 
231 	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
232 			flags & XFS_ZR_RESERVED);
233 	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
234 		error = xfs_zoned_reserve_extents_greedy(mp, &count_fsb, flags);
235 	if (error)
236 		return error;
237 
238 	error = xfs_zoned_reserve_available(mp, count_fsb, flags);
239 	if (error) {
240 		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
241 		return error;
242 	}
243 	ac->reserved_blocks = count_fsb;
244 	return 0;
245 }
246 
247 void
248 xfs_zoned_space_unreserve(
249 	struct xfs_mount		*mp,
250 	struct xfs_zone_alloc_ctx	*ac)
251 {
252 	if (ac->reserved_blocks > 0) {
253 		xfs_zoned_add_available(mp, ac->reserved_blocks);
254 		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
255 	}
256 	if (ac->open_zone)
257 		xfs_open_zone_put(ac->open_zone);
258 }
259