// SPDX-License-Identifier: GPL-2.0+
/*
 * Copyright (C) 2016 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_sb.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_alloc.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_cksum.h"
#include "xfs_trans.h"
#include "xfs_bit.h"
#include "xfs_bmap.h"
#include "xfs_bmap_btree.h"
#include "xfs_ag_resv.h"
#include "xfs_trans_space.h"
#include "xfs_rmap_btree.h"
#include "xfs_btree.h"
#include "xfs_refcount_btree.h"
#include "xfs_ialloc_btree.h"

/*
 * Per-AG Block Reservations
 *
 * For some kinds of allocation group metadata structures, it is advantageous
 * to reserve a small number of blocks in each AG so that future expansions of
 * that data structure do not encounter ENOSPC, because errors during a btree
 * split would take the filesystem offline.
 *
 * Prior to the introduction of reflink, this wasn't an issue because the free
 * space btrees maintain a reserve of space (the AGFL) to handle any expansion
 * that may be necessary; and allocations of other metadata (inodes, BMBT,
 * dir/attr) aren't restricted to a single AG. However, with reflink it is
 * possible to allocate all the space in an AG, have subsequent reflink/CoW
 * activity expand the refcount btree, and discover that there's no space left
 * to handle that expansion. Since we can calculate the maximum size of the
 * refcount btree, we can reserve space for it and avoid ENOSPC.
 *
 * Handling per-AG reservations consists of four changes to the allocator's
 * behavior: First, because these reservations are always needed, we decrease
 * the ag_max_usable counter to reflect the size of the AG after the reserved
 * blocks are taken. Second, the reservations must be reflected in the
 * fdblocks count to maintain proper accounting. Third, each AG must maintain
 * its own reserved block counter so that we can calculate the amount of space
 * that must remain free to maintain the reservations. Fourth, the "remaining
 * reserved blocks" count must be used when calculating the length of the
 * longest free extent in an AG and to clamp maxlen in the per-AG allocation
 * functions. In other words, we maintain a virtual allocation via in-core
 * accounting tricks so that we don't have to clean up after a crash. :)
 *
 * Reserved blocks can be managed by passing one of the enum xfs_ag_resv_type
 * values via struct xfs_alloc_arg or directly to the xfs_free_extent
 * function. It might seem a little funny to maintain a reservoir of blocks
 * to feed another reservoir, but the AGFL only holds enough blocks to get
 * through the next transaction. The per-AG reservation is to ensure (we
 * hope) that each AG never runs out of blocks. Each data structure wanting
 * to use the reservation system should update ask/used in xfs_ag_resv_init.
 */

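/*
 * A rough worked example of the ask/used accounting (figures invented purely
 * for illustration): if the refcount btree in an AG could grow to at most 100
 * blocks and currently occupies 20, its owner contributes ask = 100 and
 * used = 20 in xfs_ag_resv_init, leaving ar_reserved = 80 blocks held back
 * from ordinary allocations. Callers pick which pool an allocation or a free
 * comes from by supplying the reservation type via struct xfs_alloc_arg or to
 * xfs_free_extent, as noted above; everything else uses XFS_AG_RESV_NONE and
 * is accounted against the regular free space.
 */
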
/*
 * Are we critically low on blocks?  For now we'll define that as the number
 * of blocks we can get our hands on being less than 10% of what we reserved
 * or less than some arbitrary number (maximum btree height).
 */
bool
xfs_ag_resv_critical(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	xfs_extlen_t			avail;
	xfs_extlen_t			orig;

	switch (type) {
	case XFS_AG_RESV_METADATA:
		avail = pag->pagf_freeblks - pag->pag_rmapbt_resv.ar_reserved;
		orig = pag->pag_meta_resv.ar_asked;
		break;
	case XFS_AG_RESV_RMAPBT:
		avail = pag->pagf_freeblks + pag->pagf_flcount -
			pag->pag_meta_resv.ar_reserved;
		orig = pag->pag_rmapbt_resv.ar_asked;
		break;
	default:
		ASSERT(0);
		return false;
	}

	trace_xfs_ag_resv_critical(pag, type, avail);

	/* Critically low if less than 10% or max btree height remains. */
	return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
			pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL);
}

/*
 * How many blocks are reserved but not used, and therefore must not be
 * allocated away?
 */
xfs_extlen_t
xfs_ag_resv_needed(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	xfs_extlen_t			len;

	len = pag->pag_meta_resv.ar_reserved + pag->pag_rmapbt_resv.ar_reserved;
	switch (type) {
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		len -= xfs_perag_resv(pag, type)->ar_reserved;
		break;
	case XFS_AG_RESV_NONE:
		/* empty */
		break;
	default:
		ASSERT(0);
	}

	trace_xfs_ag_resv_needed(pag, type, len);

	return len;
}

/* Clean out a reservation */
static int
__xfs_ag_resv_free(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type)
{
	struct xfs_ag_resv		*resv;
	xfs_extlen_t			oldresv;
	int				error;

	trace_xfs_ag_resv_free(pag, type, 0);

	resv = xfs_perag_resv(pag, type);
	if (pag->pag_agno == 0)
		pag->pag_mount->m_ag_max_usable += resv->ar_asked;
	/*
	 * RMAPBT blocks come from the AGFL and AGFL blocks are always
	 * considered "free", so whatever was reserved at mount time must be
	 * given back at umount.
	 */
	if (type == XFS_AG_RESV_RMAPBT)
		oldresv = resv->ar_orig_reserved;
	else
		oldresv = resv->ar_reserved;
	error = xfs_mod_fdblocks(pag->pag_mount, oldresv, true);
	resv->ar_reserved = 0;
	resv->ar_asked = 0;
	resv->ar_orig_reserved = 0;

	if (error)
		trace_xfs_ag_resv_free_error(pag->pag_mount, pag->pag_agno,
				error, _RET_IP_);
	return error;
}

/* Free a per-AG reservation. */
int
xfs_ag_resv_free(
	struct xfs_perag		*pag)
{
	int				error;
	int				err2;

	error = __xfs_ag_resv_free(pag, XFS_AG_RESV_RMAPBT);
	err2 = __xfs_ag_resv_free(pag, XFS_AG_RESV_METADATA);
	if (err2 && !error)
		error = err2;
	return error;
}

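/*
 * Set up one reservation within an AG. As a rough illustration (numbers made
 * up): a metadata reservation with ask = 100 and used = 20 hides only the 80
 * unused blocks from fdblocks, because the 20 blocks already in use by the
 * btrees are counted as used space on disk; an rmapbt reservation with the
 * same numbers hides all 100 blocks, because rmapbt blocks always appear to
 * be free space on disk.
 */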
static int
__xfs_ag_resv_init(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	xfs_extlen_t			ask,
	xfs_extlen_t			used)
{
	struct xfs_mount		*mp = pag->pag_mount;
	struct xfs_ag_resv		*resv;
	int				error;
	xfs_extlen_t			hidden_space;

	if (used > ask)
		ask = used;

	switch (type) {
	case XFS_AG_RESV_RMAPBT:
		/*
		 * Space taken by the rmapbt is not subtracted from fdblocks
		 * because the rmapbt lives in the free space. Here we must
		 * subtract the entire reservation from fdblocks so that we
		 * always have blocks available for rmapbt expansion.
		 */
		hidden_space = ask;
		break;
	case XFS_AG_RESV_METADATA:
		/*
		 * Space taken by all other metadata btrees is accounted
		 * on-disk as used space. We therefore only hide the space
		 * that is reserved but not used by the trees.
		 */
		hidden_space = ask - used;
		break;
	default:
		ASSERT(0);
		return -EINVAL;
	}
	error = xfs_mod_fdblocks(mp, -(int64_t)hidden_space, true);
	if (error) {
		trace_xfs_ag_resv_init_error(pag->pag_mount, pag->pag_agno,
				error, _RET_IP_);
		xfs_warn(mp,
"Per-AG reservation for AG %u failed. Filesystem may run out of space.",
				pag->pag_agno);
		return error;
	}

	/*
	 * Reduce the maximum per-AG allocation length by however much we're
	 * trying to reserve for an AG. Since this is a filesystem-wide
	 * counter, we only make the adjustment for AG 0. This assumes that
	 * there aren't any AGs hungrier for per-AG reservation than AG 0.
	 */
	if (pag->pag_agno == 0)
		mp->m_ag_max_usable -= ask;

	resv = xfs_perag_resv(pag, type);
	resv->ar_asked = ask;
	resv->ar_orig_reserved = hidden_space;
	resv->ar_reserved = ask - used;

	trace_xfs_ag_resv_init(pag, type, ask);
	return 0;
}

/* Create a per-AG block reservation. */
int
xfs_ag_resv_init(
	struct xfs_perag		*pag)
{
	struct xfs_mount		*mp = pag->pag_mount;
	xfs_agnumber_t			agno = pag->pag_agno;
	xfs_extlen_t			ask;
	xfs_extlen_t			used;
	int				error = 0;

	/* Create the metadata reservation. */
	if (pag->pag_meta_resv.ar_asked == 0) {
		ask = used = 0;

		error = xfs_refcountbt_calc_reserves(mp, agno, &ask, &used);
		if (error)
			goto out;

		error = xfs_finobt_calc_reserves(mp, agno, &ask, &used);
		if (error)
			goto out;

		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
				ask, used);
		if (error) {
			/*
			 * Because we didn't have per-AG reservations when the
			 * finobt feature was added we might not be able to
			 * reserve all needed blocks. Warn and fall back to the
			 * old and potentially buggy code in that case, but
			 * ensure we do have the reservation for the
			 * refcountbt.
			 */
			ask = used = 0;

			mp->m_inotbt_nores = true;

			error = xfs_refcountbt_calc_reserves(mp, agno, &ask,
					&used);
			if (error)
				goto out;

			error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
					ask, used);
			if (error)
				goto out;
		}
	}

	/* Create the RMAPBT metadata reservation */
	if (pag->pag_rmapbt_resv.ar_asked == 0) {
		ask = used = 0;

		error = xfs_rmapbt_calc_reserves(mp, agno, &ask, &used);
		if (error)
			goto out;

		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_RMAPBT, ask, used);
		if (error)
			goto out;
	}

#ifdef DEBUG
	/* need to read in the AGF for the ASSERT below to work */
	error = xfs_alloc_pagf_init(pag->pag_mount, NULL, pag->pag_agno, 0);
	if (error)
		return error;

	ASSERT(xfs_perag_resv(pag, XFS_AG_RESV_METADATA)->ar_reserved +
	       xfs_perag_resv(pag, XFS_AG_RESV_RMAPBT)->ar_reserved <=
	       pag->pagf_freeblks + pag->pagf_flcount);
#endif
out:
	return error;
}

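/*
 * The two helpers below carve allocations and frees out of (or back into) the
 * reserved pool first. As a sketch of the arithmetic (values made up): if
 * args->len = 10 blocks are allocated while ar_reserved = 4, then 4 blocks
 * are charged against the reservation, needing only an on-disk superblock
 * update because fdblocks was already debited when the reservation was made,
 * and the remaining 6 blocks are charged to ordinary free space, which needs
 * both the in-core and on-disk counters updated.
 */
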
/* Allocate a block from the reservation. */
void
xfs_ag_resv_alloc_extent(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	struct xfs_alloc_arg		*args)
{
	struct xfs_ag_resv		*resv;
	xfs_extlen_t			len;
	uint				field;

	trace_xfs_ag_resv_alloc_extent(pag, type, args->len);

	switch (type) {
	case XFS_AG_RESV_AGFL:
		return;
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		resv = xfs_perag_resv(pag, type);
		break;
	default:
		ASSERT(0);
		/* fall through */
	case XFS_AG_RESV_NONE:
		field = args->wasdel ? XFS_TRANS_SB_RES_FDBLOCKS :
				       XFS_TRANS_SB_FDBLOCKS;
		xfs_trans_mod_sb(args->tp, field, -(int64_t)args->len);
		return;
	}

	len = min_t(xfs_extlen_t, args->len, resv->ar_reserved);
	resv->ar_reserved -= len;
	if (type == XFS_AG_RESV_RMAPBT)
		return;
	/* Allocations of reserved blocks only need on-disk sb updates... */
	xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -(int64_t)len);
	/* ...but non-reserved blocks need in-core and on-disk updates. */
	if (args->len > len)
		xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_FDBLOCKS,
				-((int64_t)args->len - len));
}

/* Free a block to the reservation. */
void
xfs_ag_resv_free_extent(
	struct xfs_perag		*pag,
	enum xfs_ag_resv_type		type,
	struct xfs_trans		*tp,
	xfs_extlen_t			len)
{
	xfs_extlen_t			leftover;
	struct xfs_ag_resv		*resv;

	trace_xfs_ag_resv_free_extent(pag, type, len);

	switch (type) {
	case XFS_AG_RESV_AGFL:
		return;
	case XFS_AG_RESV_METADATA:
	case XFS_AG_RESV_RMAPBT:
		resv = xfs_perag_resv(pag, type);
		break;
	default:
		ASSERT(0);
		/* fall through */
	case XFS_AG_RESV_NONE:
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, (int64_t)len);
		return;
	}

	leftover = min_t(xfs_extlen_t, len, resv->ar_asked - resv->ar_reserved);
	resv->ar_reserved += leftover;
	if (type == XFS_AG_RESV_RMAPBT)
		return;
	/* Freeing into the reserved pool only requires on-disk update... */
	xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, len);
	/* ...but freeing beyond that requires in-core and on-disk update. */
	if (len > leftover)
		xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len - leftover);
}
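
/*
 * Typical life cycle, sketched for orientation rather than as a literal call
 * chain: xfs_ag_resv_init() is called for each AG to establish the
 * reservations, xfs_ag_resv_alloc_extent() and xfs_ag_resv_free_extent()
 * adjust them as the allocator hands out and takes back blocks,
 * xfs_ag_resv_critical() lets callers check whether a pool is nearly
 * exhausted, and xfs_ag_resv_free() returns everything before the perag goes
 * away.
 */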