// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2023-2025 Christoph Hellwig.
 * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates.
 */
#include "xfs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_rtbitmap.h"
#include "xfs_zone_alloc.h"
#include "xfs_zone_priv.h"
#include "xfs_zones.h"

/*
 * Note: the zoned allocator does not support a rtextsize > 1, so this code and
 * the allocator itself use file system blocks interchangeably with realtime
 * extents without doing the otherwise required conversions.
 */

/*
 * Per-task space reservation.
 *
 * Tasks that need to wait for GC to free up space allocate one of these
 * on-stack and add it to the per-mount zi_reclaim_reservations list.
 * The GC thread will then wake the tasks in order when space becomes available.
 */
struct xfs_zone_reservation {
	struct list_head	entry;
	struct task_struct	*task;
	xfs_filblks_t		count_fsb;
};

/*
 * Calculate the number of reserved blocks.
 *
 * XC_FREE_RTEXTENTS counts the user-available capacity, i.e. the space up to
 * which the file system can be filled, while XC_FREE_RTAVAILABLE counts the
 * blocks instantly available for writes without waiting for GC.
 *
 * For XC_FREE_RTAVAILABLE, only the smaller reservation required for GC and
 * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS
 * is further restricted by at least one additional zone as well as the
 * optional persistently reserved blocks.  This allows the allocator to run
 * more smoothly by not always triggering GC.
 */
uint64_t
xfs_zoned_default_resblks(
	struct xfs_mount	*mp,
	enum xfs_free_counter	ctr)
{
	switch (ctr) {
	case XC_FREE_RTEXTENTS:
		return (uint64_t)XFS_RESERVED_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks +
			mp->m_sb.sb_rtreserved;
	case XC_FREE_RTAVAILABLE:
		return (uint64_t)XFS_GC_ZONES *
			mp->m_groups[XG_TYPE_RTG].blocks;
	default:
		ASSERT(0);
		return 0;
	}
}

void
xfs_zoned_resv_wake_all(
	struct xfs_mount	*mp)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	spin_lock(&zi->zi_reservation_lock);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry)
		wake_up_process(reservation->task);
	spin_unlock(&zi->zi_reservation_lock);
}

void
xfs_zoned_add_available(
	struct xfs_mount	*mp,
	xfs_filblks_t		count_fsb)
{
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	*reservation;

	if (list_empty_careful(&zi->zi_reclaim_reservations)) {
		xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
		return;
	}

	spin_lock(&zi->zi_reservation_lock);
	xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb);
	count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE);
	list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) {
		if (reservation->count_fsb > count_fsb)
			break;
		wake_up_process(reservation->task);
		count_fsb -= reservation->count_fsb;
	}
	spin_unlock(&zi->zi_reservation_lock);
}

static int
xfs_zoned_space_wait_error(
	struct xfs_mount	*mp)
{
	if (xfs_is_shutdown(mp))
		return -EIO;
	if (fatal_signal_pending(current))
		return -EINTR;
	return 0;
}

static int
xfs_zoned_reserve_available(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags)
{
	struct xfs_mount		*mp = ip->i_mount;
	struct xfs_zone_info		*zi = mp->m_zone_info;
	struct xfs_zone_reservation	reservation = {
		.task		= current,
		.count_fsb	= count_fsb,
	};
	int				error;

	/*
	 * If there are no waiters, try to directly grab the available blocks
	 * from the percpu counter.
	 *
	 * If the caller wants to dip into the reserved pool, also bypass the
	 * wait list.  This relies on the fact that we have a very generously
	 * sized reserved pool that always has enough space.  If the reserved
	 * allocations fail we're in trouble.
	 */
	if (likely(list_empty_careful(&zi->zi_reclaim_reservations) ||
	    (flags & XFS_ZR_RESERVED))) {
		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			return error;
	}

	if (flags & XFS_ZR_NOWAIT)
		return -EAGAIN;

	spin_lock(&zi->zi_reservation_lock);
	list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations);
	while ((error = xfs_zoned_space_wait_error(mp)) == 0) {
		set_current_state(TASK_KILLABLE);

		error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb,
				flags & XFS_ZR_RESERVED);
		if (error != -ENOSPC)
			break;

		/*
		 * Make sure to start GC if it is not running already.  As we
		 * check the rtavailable count when filling up zones, GC is
		 * normally already running at this point, but in some setups
		 * with very few zones we may completely run out of
		 * non-reserved blocks in between filling zones.
		 */
		if (!xfs_is_zonegc_running(mp))
			wake_up_process(zi->zi_gc_thread);

		/*
		 * If there is no reclaimable group left and we aren't still
		 * processing a pending GC request, give up as we're fully out
		 * of space.
		 */
		if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE) &&
		    !xfs_is_zonegc_running(mp))
			break;

		spin_unlock(&zi->zi_reservation_lock);
		schedule();
		spin_lock(&zi->zi_reservation_lock);
	}
	list_del(&reservation.entry);
	spin_unlock(&zi->zi_reservation_lock);

	__set_current_state(TASK_RUNNING);
	return error;
}

/*
 * Implement greedy space allocation for short writes by trying to grab all
 * that is left after locking out other threads from trying to do the same.
 *
 * This isn't exactly optimal and can hopefully be replaced by a proper
 * percpu_counter primitive one day.
 */
static int
xfs_zoned_reserve_extents_greedy(
	struct xfs_inode	*ip,
	xfs_filblks_t		*count_fsb,
	unsigned int		flags)
{
	struct xfs_mount	*mp = ip->i_mount;
	struct xfs_zone_info	*zi = mp->m_zone_info;
	s64			len = *count_fsb;
	int			error = -ENOSPC;

	spin_lock(&zi->zi_reservation_lock);
	len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS));
	if (len > 0) {
		*count_fsb = len;
		error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb,
				flags & XFS_ZR_RESERVED);
	}
	spin_unlock(&zi->zi_reservation_lock);
	return error;
}

int
xfs_zoned_space_reserve(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	unsigned int		flags,
	struct xfs_zone_alloc_ctx	*ac)
{
	struct xfs_mount	*mp = ip->i_mount;
	int			error;

	ASSERT(ac->reserved_blocks == 0);
	ASSERT(ac->open_zone == NULL);

	error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb,
			flags & XFS_ZR_RESERVED);
	if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1)
		error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags);
	if (error)
		return error;

	error = xfs_zoned_reserve_available(ip, count_fsb, flags);
	if (error) {
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb);
		return error;
	}
	ac->reserved_blocks = count_fsb;
	return 0;
}

void
xfs_zoned_space_unreserve(
	struct xfs_inode	*ip,
	struct xfs_zone_alloc_ctx	*ac)
{
	if (ac->reserved_blocks > 0) {
		struct xfs_mount	*mp = ip->i_mount;

		xfs_zoned_add_available(mp, ac->reserved_blocks);
		xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks);
	}
	if (ac->open_zone)
		xfs_open_zone_put(ac->open_zone);
}
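
/*
 * Illustrative only, not part of the file above: a minimal sketch of how a
 * caller might pair xfs_zoned_space_reserve() with xfs_zoned_space_unreserve()
 * around a write.  The helper name, its parameters, and the flag choices are
 * made up for the example; real write paths pick flags based on their own
 * context.
 */
static int __maybe_unused
xfs_zoned_space_reserve_example(
	struct xfs_inode	*ip,
	xfs_filblks_t		count_fsb,
	bool			nowait)
{
	struct xfs_zone_alloc_ctx ac = { };	/* must start out zeroed */
	unsigned int		flags = XFS_ZR_GREEDY;
	int			error;

	if (nowait)
		flags |= XFS_ZR_NOWAIT;	/* -EAGAIN instead of waiting on GC */

	/*
	 * Takes blocks from XC_FREE_RTEXTENTS and XC_FREE_RTAVAILABLE.  With
	 * XFS_ZR_GREEDY the reservation may be shortened on ENOSPC, so check
	 * ac.reserved_blocks rather than assuming count_fsb was granted.
	 */
	error = xfs_zoned_space_reserve(ip, count_fsb, flags, &ac);
	if (error)
		return error;

	/*
	 * ... map extents / perform the write here, consuming up to
	 * ac.reserved_blocks ...
	 */

	/* Return the unused reservation and drop the open zone reference. */
	xfs_zoned_space_unreserve(ip, &ac);
	return 0;
}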