1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_shared.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_defer.h" 15 #include "xfs_btree.h" 16 #include "xfs_rmap.h" 17 #include "xfs_alloc_btree.h" 18 #include "xfs_alloc.h" 19 #include "xfs_extent_busy.h" 20 #include "xfs_errortag.h" 21 #include "xfs_error.h" 22 #include "xfs_trace.h" 23 #include "xfs_trans.h" 24 #include "xfs_buf_item.h" 25 #include "xfs_log.h" 26 #include "xfs_ag.h" 27 #include "xfs_ag_resv.h" 28 #include "xfs_bmap.h" 29 30 struct kmem_cache *xfs_extfree_item_cache; 31 32 struct workqueue_struct *xfs_alloc_wq; 33 34 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) 35 36 #define XFSA_FIXUP_BNO_OK 1 37 #define XFSA_FIXUP_CNT_OK 2 38 39 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *); 40 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *); 41 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *); 42 43 /* 44 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in 45 * the beginning of the block for a proper header with the location information 46 * and CRC. 47 */ 48 unsigned int 49 xfs_agfl_size( 50 struct xfs_mount *mp) 51 { 52 unsigned int size = mp->m_sb.sb_sectsize; 53 54 if (xfs_has_crc(mp)) 55 size -= sizeof(struct xfs_agfl); 56 57 return size / sizeof(xfs_agblock_t); 58 } 59 60 unsigned int 61 xfs_refc_block( 62 struct xfs_mount *mp) 63 { 64 if (xfs_has_rmapbt(mp)) 65 return XFS_RMAP_BLOCK(mp) + 1; 66 if (xfs_has_finobt(mp)) 67 return XFS_FIBT_BLOCK(mp) + 1; 68 return XFS_IBT_BLOCK(mp) + 1; 69 } 70 71 xfs_extlen_t 72 xfs_prealloc_blocks( 73 struct xfs_mount *mp) 74 { 75 if (xfs_has_reflink(mp)) 76 return xfs_refc_block(mp) + 1; 77 if (xfs_has_rmapbt(mp)) 78 return XFS_RMAP_BLOCK(mp) + 1; 79 if (xfs_has_finobt(mp)) 80 return XFS_FIBT_BLOCK(mp) + 1; 81 return XFS_IBT_BLOCK(mp) + 1; 82 } 83 84 /* 85 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to 86 * guarantee that we can refill the AGFL prior to allocating space in a nearly 87 * full AG. Although the space described by the free space btrees, the 88 * blocks used by the freesp btrees themselves, and the blocks owned by the 89 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk 90 * free space in the AG drop so low that the free space btrees cannot refill an 91 * empty AGFL up to the minimum level. Rather than grind through empty AGs 92 * until the fs goes down, we subtract this many AG blocks from the incore 93 * fdblocks to ensure user allocation does not overcommit the space the 94 * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to 95 * withhold space from xfs_mod_fdblocks, so we do not account for that here. 96 */ 97 #define XFS_ALLOCBT_AGFL_RESERVE 4 98 99 /* 100 * Compute the number of blocks that we set aside to guarantee the ability to 101 * refill the AGFL and handle a full bmap btree split. 102 * 103 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of 104 * AGF buffer (PV 947395), we place constraints on the relationship among 105 * actual allocations for data blocks, freelist blocks, and potential file data 106 * bmap btree blocks. However, these restrictions may result in no actual space 107 * allocated for a delayed extent, for example, a data block in a certain AG is 108 * allocated but there is no additional block for the additional bmap btree 109 * block due to a split of the bmap btree of the file. The result of this may 110 * lead to an infinite loop when the file gets flushed to disk and all delayed 111 * extents need to be actually allocated. To get around this, we explicitly set 112 * aside a few blocks which will not be reserved in delayed allocation. 113 * 114 * For each AG, we need to reserve enough blocks to replenish a totally empty 115 * AGFL and 4 more to handle a potential split of the file's bmap btree. 116 */ 117 unsigned int 118 xfs_alloc_set_aside( 119 struct xfs_mount *mp) 120 { 121 return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); 122 } 123 124 /* 125 * When deciding how much space to allocate out of an AG, we limit the 126 * allocation maximum size to the size the AG. However, we cannot use all the 127 * blocks in the AG - some are permanently used by metadata. These 128 * blocks are generally: 129 * - the AG superblock, AGF, AGI and AGFL 130 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally 131 * the AGI free inode and rmap btree root blocks. 132 * - blocks on the AGFL according to xfs_alloc_set_aside() limits 133 * - the rmapbt root block 134 * 135 * The AG headers are sector sized, so the amount of space they take up is 136 * dependent on filesystem geometry. The others are all single blocks. 137 */ 138 unsigned int 139 xfs_alloc_ag_max_usable( 140 struct xfs_mount *mp) 141 { 142 unsigned int blocks; 143 144 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ 145 blocks += XFS_ALLOCBT_AGFL_RESERVE; 146 blocks += 3; /* AGF, AGI btree root blocks */ 147 if (xfs_has_finobt(mp)) 148 blocks++; /* finobt root block */ 149 if (xfs_has_rmapbt(mp)) 150 blocks++; /* rmap root block */ 151 if (xfs_has_reflink(mp)) 152 blocks++; /* refcount root block */ 153 154 return mp->m_sb.sb_agblocks - blocks; 155 } 156 157 /* 158 * Lookup the record equal to [bno, len] in the btree given by cur. 159 */ 160 STATIC int /* error */ 161 xfs_alloc_lookup_eq( 162 struct xfs_btree_cur *cur, /* btree cursor */ 163 xfs_agblock_t bno, /* starting block of extent */ 164 xfs_extlen_t len, /* length of extent */ 165 int *stat) /* success/failure */ 166 { 167 int error; 168 169 cur->bc_rec.a.ar_startblock = bno; 170 cur->bc_rec.a.ar_blockcount = len; 171 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat); 172 cur->bc_ag.abt.active = (*stat == 1); 173 return error; 174 } 175 176 /* 177 * Lookup the first record greater than or equal to [bno, len] 178 * in the btree given by cur. 179 */ 180 int /* error */ 181 xfs_alloc_lookup_ge( 182 struct xfs_btree_cur *cur, /* btree cursor */ 183 xfs_agblock_t bno, /* starting block of extent */ 184 xfs_extlen_t len, /* length of extent */ 185 int *stat) /* success/failure */ 186 { 187 int error; 188 189 cur->bc_rec.a.ar_startblock = bno; 190 cur->bc_rec.a.ar_blockcount = len; 191 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat); 192 cur->bc_ag.abt.active = (*stat == 1); 193 return error; 194 } 195 196 /* 197 * Lookup the first record less than or equal to [bno, len] 198 * in the btree given by cur. 199 */ 200 int /* error */ 201 xfs_alloc_lookup_le( 202 struct xfs_btree_cur *cur, /* btree cursor */ 203 xfs_agblock_t bno, /* starting block of extent */ 204 xfs_extlen_t len, /* length of extent */ 205 int *stat) /* success/failure */ 206 { 207 int error; 208 cur->bc_rec.a.ar_startblock = bno; 209 cur->bc_rec.a.ar_blockcount = len; 210 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat); 211 cur->bc_ag.abt.active = (*stat == 1); 212 return error; 213 } 214 215 static inline bool 216 xfs_alloc_cur_active( 217 struct xfs_btree_cur *cur) 218 { 219 return cur && cur->bc_ag.abt.active; 220 } 221 222 /* 223 * Update the record referred to by cur to the value given 224 * by [bno, len]. 225 * This either works (return 0) or gets an EFSCORRUPTED error. 226 */ 227 STATIC int /* error */ 228 xfs_alloc_update( 229 struct xfs_btree_cur *cur, /* btree cursor */ 230 xfs_agblock_t bno, /* starting block of extent */ 231 xfs_extlen_t len) /* length of extent */ 232 { 233 union xfs_btree_rec rec; 234 235 rec.alloc.ar_startblock = cpu_to_be32(bno); 236 rec.alloc.ar_blockcount = cpu_to_be32(len); 237 return xfs_btree_update(cur, &rec); 238 } 239 240 /* 241 * Get the data from the pointed-to record. 242 */ 243 int /* error */ 244 xfs_alloc_get_rec( 245 struct xfs_btree_cur *cur, /* btree cursor */ 246 xfs_agblock_t *bno, /* output: starting block of extent */ 247 xfs_extlen_t *len, /* output: length of extent */ 248 int *stat) /* output: success/failure */ 249 { 250 struct xfs_mount *mp = cur->bc_mp; 251 struct xfs_perag *pag = cur->bc_ag.pag; 252 union xfs_btree_rec *rec; 253 int error; 254 255 error = xfs_btree_get_rec(cur, &rec, stat); 256 if (error || !(*stat)) 257 return error; 258 259 *bno = be32_to_cpu(rec->alloc.ar_startblock); 260 *len = be32_to_cpu(rec->alloc.ar_blockcount); 261 262 if (*len == 0) 263 goto out_bad_rec; 264 265 /* check for valid extent range, including overflow */ 266 if (!xfs_verify_agbext(pag, *bno, *len)) 267 goto out_bad_rec; 268 269 return 0; 270 271 out_bad_rec: 272 xfs_warn(mp, 273 "%s Freespace BTree record corruption in AG %d detected!", 274 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size", 275 pag->pag_agno); 276 xfs_warn(mp, 277 "start block 0x%x block count 0x%x", *bno, *len); 278 return -EFSCORRUPTED; 279 } 280 281 /* 282 * Compute aligned version of the found extent. 283 * Takes alignment and min length into account. 284 */ 285 STATIC bool 286 xfs_alloc_compute_aligned( 287 xfs_alloc_arg_t *args, /* allocation argument structure */ 288 xfs_agblock_t foundbno, /* starting block in found extent */ 289 xfs_extlen_t foundlen, /* length in found extent */ 290 xfs_agblock_t *resbno, /* result block number */ 291 xfs_extlen_t *reslen, /* result length */ 292 unsigned *busy_gen) 293 { 294 xfs_agblock_t bno = foundbno; 295 xfs_extlen_t len = foundlen; 296 xfs_extlen_t diff; 297 bool busy; 298 299 /* Trim busy sections out of found extent */ 300 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); 301 302 /* 303 * If we have a largish extent that happens to start before min_agbno, 304 * see if we can shift it into range... 305 */ 306 if (bno < args->min_agbno && bno + len > args->min_agbno) { 307 diff = args->min_agbno - bno; 308 if (len > diff) { 309 bno += diff; 310 len -= diff; 311 } 312 } 313 314 if (args->alignment > 1 && len >= args->minlen) { 315 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 316 317 diff = aligned_bno - bno; 318 319 *resbno = aligned_bno; 320 *reslen = diff >= len ? 0 : len - diff; 321 } else { 322 *resbno = bno; 323 *reslen = len; 324 } 325 326 return busy; 327 } 328 329 /* 330 * Compute best start block and diff for "near" allocations. 331 * freelen >= wantlen already checked by caller. 332 */ 333 STATIC xfs_extlen_t /* difference value (absolute) */ 334 xfs_alloc_compute_diff( 335 xfs_agblock_t wantbno, /* target starting block */ 336 xfs_extlen_t wantlen, /* target length */ 337 xfs_extlen_t alignment, /* target alignment */ 338 int datatype, /* are we allocating data? */ 339 xfs_agblock_t freebno, /* freespace's starting block */ 340 xfs_extlen_t freelen, /* freespace's length */ 341 xfs_agblock_t *newbnop) /* result: best start block from free */ 342 { 343 xfs_agblock_t freeend; /* end of freespace extent */ 344 xfs_agblock_t newbno1; /* return block number */ 345 xfs_agblock_t newbno2; /* other new block number */ 346 xfs_extlen_t newlen1=0; /* length with newbno1 */ 347 xfs_extlen_t newlen2=0; /* length with newbno2 */ 348 xfs_agblock_t wantend; /* end of target extent */ 349 bool userdata = datatype & XFS_ALLOC_USERDATA; 350 351 ASSERT(freelen >= wantlen); 352 freeend = freebno + freelen; 353 wantend = wantbno + wantlen; 354 /* 355 * We want to allocate from the start of a free extent if it is past 356 * the desired block or if we are allocating user data and the free 357 * extent is before desired block. The second case is there to allow 358 * for contiguous allocation from the remaining free space if the file 359 * grows in the short term. 360 */ 361 if (freebno >= wantbno || (userdata && freeend < wantend)) { 362 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 363 newbno1 = NULLAGBLOCK; 364 } else if (freeend >= wantend && alignment > 1) { 365 newbno1 = roundup(wantbno, alignment); 366 newbno2 = newbno1 - alignment; 367 if (newbno1 >= freeend) 368 newbno1 = NULLAGBLOCK; 369 else 370 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); 371 if (newbno2 < freebno) 372 newbno2 = NULLAGBLOCK; 373 else 374 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); 375 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { 376 if (newlen1 < newlen2 || 377 (newlen1 == newlen2 && 378 XFS_ABSDIFF(newbno1, wantbno) > 379 XFS_ABSDIFF(newbno2, wantbno))) 380 newbno1 = newbno2; 381 } else if (newbno2 != NULLAGBLOCK) 382 newbno1 = newbno2; 383 } else if (freeend >= wantend) { 384 newbno1 = wantbno; 385 } else if (alignment > 1) { 386 newbno1 = roundup(freeend - wantlen, alignment); 387 if (newbno1 > freeend - wantlen && 388 newbno1 - alignment >= freebno) 389 newbno1 -= alignment; 390 else if (newbno1 >= freeend) 391 newbno1 = NULLAGBLOCK; 392 } else 393 newbno1 = freeend - wantlen; 394 *newbnop = newbno1; 395 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); 396 } 397 398 /* 399 * Fix up the length, based on mod and prod. 400 * len should be k * prod + mod for some k. 401 * If len is too small it is returned unchanged. 402 * If len hits maxlen it is left alone. 403 */ 404 STATIC void 405 xfs_alloc_fix_len( 406 xfs_alloc_arg_t *args) /* allocation argument structure */ 407 { 408 xfs_extlen_t k; 409 xfs_extlen_t rlen; 410 411 ASSERT(args->mod < args->prod); 412 rlen = args->len; 413 ASSERT(rlen >= args->minlen); 414 ASSERT(rlen <= args->maxlen); 415 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || 416 (args->mod == 0 && rlen < args->prod)) 417 return; 418 k = rlen % args->prod; 419 if (k == args->mod) 420 return; 421 if (k > args->mod) 422 rlen = rlen - (k - args->mod); 423 else 424 rlen = rlen - args->prod + (args->mod - k); 425 /* casts to (int) catch length underflows */ 426 if ((int)rlen < (int)args->minlen) 427 return; 428 ASSERT(rlen >= args->minlen && rlen <= args->maxlen); 429 ASSERT(rlen % args->prod == args->mod); 430 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= 431 rlen + args->minleft); 432 args->len = rlen; 433 } 434 435 /* 436 * Update the two btrees, logically removing from freespace the extent 437 * starting at rbno, rlen blocks. The extent is contained within the 438 * actual (current) free extent fbno for flen blocks. 439 * Flags are passed in indicating whether the cursors are set to the 440 * relevant records. 441 */ 442 STATIC int /* error code */ 443 xfs_alloc_fixup_trees( 444 struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */ 445 struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */ 446 xfs_agblock_t fbno, /* starting block of free extent */ 447 xfs_extlen_t flen, /* length of free extent */ 448 xfs_agblock_t rbno, /* starting block of returned extent */ 449 xfs_extlen_t rlen, /* length of returned extent */ 450 int flags) /* flags, XFSA_FIXUP_... */ 451 { 452 int error; /* error code */ 453 int i; /* operation results */ 454 xfs_agblock_t nfbno1; /* first new free startblock */ 455 xfs_agblock_t nfbno2; /* second new free startblock */ 456 xfs_extlen_t nflen1=0; /* first new free length */ 457 xfs_extlen_t nflen2=0; /* second new free length */ 458 struct xfs_mount *mp; 459 460 mp = cnt_cur->bc_mp; 461 462 /* 463 * Look up the record in the by-size tree if necessary. 464 */ 465 if (flags & XFSA_FIXUP_CNT_OK) { 466 #ifdef DEBUG 467 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) 468 return error; 469 if (XFS_IS_CORRUPT(mp, 470 i != 1 || 471 nfbno1 != fbno || 472 nflen1 != flen)) 473 return -EFSCORRUPTED; 474 #endif 475 } else { 476 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) 477 return error; 478 if (XFS_IS_CORRUPT(mp, i != 1)) 479 return -EFSCORRUPTED; 480 } 481 /* 482 * Look up the record in the by-block tree if necessary. 483 */ 484 if (flags & XFSA_FIXUP_BNO_OK) { 485 #ifdef DEBUG 486 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) 487 return error; 488 if (XFS_IS_CORRUPT(mp, 489 i != 1 || 490 nfbno1 != fbno || 491 nflen1 != flen)) 492 return -EFSCORRUPTED; 493 #endif 494 } else { 495 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) 496 return error; 497 if (XFS_IS_CORRUPT(mp, i != 1)) 498 return -EFSCORRUPTED; 499 } 500 501 #ifdef DEBUG 502 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { 503 struct xfs_btree_block *bnoblock; 504 struct xfs_btree_block *cntblock; 505 506 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp); 507 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp); 508 509 if (XFS_IS_CORRUPT(mp, 510 bnoblock->bb_numrecs != 511 cntblock->bb_numrecs)) 512 return -EFSCORRUPTED; 513 } 514 #endif 515 516 /* 517 * Deal with all four cases: the allocated record is contained 518 * within the freespace record, so we can have new freespace 519 * at either (or both) end, or no freespace remaining. 520 */ 521 if (rbno == fbno && rlen == flen) 522 nfbno1 = nfbno2 = NULLAGBLOCK; 523 else if (rbno == fbno) { 524 nfbno1 = rbno + rlen; 525 nflen1 = flen - rlen; 526 nfbno2 = NULLAGBLOCK; 527 } else if (rbno + rlen == fbno + flen) { 528 nfbno1 = fbno; 529 nflen1 = flen - rlen; 530 nfbno2 = NULLAGBLOCK; 531 } else { 532 nfbno1 = fbno; 533 nflen1 = rbno - fbno; 534 nfbno2 = rbno + rlen; 535 nflen2 = (fbno + flen) - nfbno2; 536 } 537 /* 538 * Delete the entry from the by-size btree. 539 */ 540 if ((error = xfs_btree_delete(cnt_cur, &i))) 541 return error; 542 if (XFS_IS_CORRUPT(mp, i != 1)) 543 return -EFSCORRUPTED; 544 /* 545 * Add new by-size btree entry(s). 546 */ 547 if (nfbno1 != NULLAGBLOCK) { 548 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 549 return error; 550 if (XFS_IS_CORRUPT(mp, i != 0)) 551 return -EFSCORRUPTED; 552 if ((error = xfs_btree_insert(cnt_cur, &i))) 553 return error; 554 if (XFS_IS_CORRUPT(mp, i != 1)) 555 return -EFSCORRUPTED; 556 } 557 if (nfbno2 != NULLAGBLOCK) { 558 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 559 return error; 560 if (XFS_IS_CORRUPT(mp, i != 0)) 561 return -EFSCORRUPTED; 562 if ((error = xfs_btree_insert(cnt_cur, &i))) 563 return error; 564 if (XFS_IS_CORRUPT(mp, i != 1)) 565 return -EFSCORRUPTED; 566 } 567 /* 568 * Fix up the by-block btree entry(s). 569 */ 570 if (nfbno1 == NULLAGBLOCK) { 571 /* 572 * No remaining freespace, just delete the by-block tree entry. 573 */ 574 if ((error = xfs_btree_delete(bno_cur, &i))) 575 return error; 576 if (XFS_IS_CORRUPT(mp, i != 1)) 577 return -EFSCORRUPTED; 578 } else { 579 /* 580 * Update the by-block entry to start later|be shorter. 581 */ 582 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) 583 return error; 584 } 585 if (nfbno2 != NULLAGBLOCK) { 586 /* 587 * 2 resulting free entries, need to add one. 588 */ 589 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 590 return error; 591 if (XFS_IS_CORRUPT(mp, i != 0)) 592 return -EFSCORRUPTED; 593 if ((error = xfs_btree_insert(bno_cur, &i))) 594 return error; 595 if (XFS_IS_CORRUPT(mp, i != 1)) 596 return -EFSCORRUPTED; 597 } 598 return 0; 599 } 600 601 static xfs_failaddr_t 602 xfs_agfl_verify( 603 struct xfs_buf *bp) 604 { 605 struct xfs_mount *mp = bp->b_mount; 606 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); 607 __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); 608 int i; 609 610 /* 611 * There is no verification of non-crc AGFLs because mkfs does not 612 * initialise the AGFL to zero or NULL. Hence the only valid part of the 613 * AGFL is what the AGF says is active. We can't get to the AGF, so we 614 * can't verify just those entries are valid. 615 */ 616 if (!xfs_has_crc(mp)) 617 return NULL; 618 619 if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) 620 return __this_address; 621 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) 622 return __this_address; 623 /* 624 * during growfs operations, the perag is not fully initialised, 625 * so we can't use it for any useful checking. growfs ensures we can't 626 * use it by using uncached buffers that don't have the perag attached 627 * so we can detect and avoid this problem. 628 */ 629 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) 630 return __this_address; 631 632 for (i = 0; i < xfs_agfl_size(mp); i++) { 633 if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && 634 be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) 635 return __this_address; 636 } 637 638 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn))) 639 return __this_address; 640 return NULL; 641 } 642 643 static void 644 xfs_agfl_read_verify( 645 struct xfs_buf *bp) 646 { 647 struct xfs_mount *mp = bp->b_mount; 648 xfs_failaddr_t fa; 649 650 /* 651 * There is no verification of non-crc AGFLs because mkfs does not 652 * initialise the AGFL to zero or NULL. Hence the only valid part of the 653 * AGFL is what the AGF says is active. We can't get to the AGF, so we 654 * can't verify just those entries are valid. 655 */ 656 if (!xfs_has_crc(mp)) 657 return; 658 659 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) 660 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 661 else { 662 fa = xfs_agfl_verify(bp); 663 if (fa) 664 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 665 } 666 } 667 668 static void 669 xfs_agfl_write_verify( 670 struct xfs_buf *bp) 671 { 672 struct xfs_mount *mp = bp->b_mount; 673 struct xfs_buf_log_item *bip = bp->b_log_item; 674 xfs_failaddr_t fa; 675 676 /* no verification of non-crc AGFLs */ 677 if (!xfs_has_crc(mp)) 678 return; 679 680 fa = xfs_agfl_verify(bp); 681 if (fa) { 682 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 683 return; 684 } 685 686 if (bip) 687 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 688 689 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); 690 } 691 692 const struct xfs_buf_ops xfs_agfl_buf_ops = { 693 .name = "xfs_agfl", 694 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, 695 .verify_read = xfs_agfl_read_verify, 696 .verify_write = xfs_agfl_write_verify, 697 .verify_struct = xfs_agfl_verify, 698 }; 699 700 /* 701 * Read in the allocation group free block array. 702 */ 703 int 704 xfs_alloc_read_agfl( 705 struct xfs_perag *pag, 706 struct xfs_trans *tp, 707 struct xfs_buf **bpp) 708 { 709 struct xfs_mount *mp = pag->pag_mount; 710 struct xfs_buf *bp; 711 int error; 712 713 error = xfs_trans_read_buf( 714 mp, tp, mp->m_ddev_targp, 715 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), 716 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); 717 if (error) 718 return error; 719 xfs_buf_set_ref(bp, XFS_AGFL_REF); 720 *bpp = bp; 721 return 0; 722 } 723 724 STATIC int 725 xfs_alloc_update_counters( 726 struct xfs_trans *tp, 727 struct xfs_buf *agbp, 728 long len) 729 { 730 struct xfs_agf *agf = agbp->b_addr; 731 732 agbp->b_pag->pagf_freeblks += len; 733 be32_add_cpu(&agf->agf_freeblks, len); 734 735 if (unlikely(be32_to_cpu(agf->agf_freeblks) > 736 be32_to_cpu(agf->agf_length))) { 737 xfs_buf_mark_corrupt(agbp); 738 return -EFSCORRUPTED; 739 } 740 741 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); 742 return 0; 743 } 744 745 /* 746 * Block allocation algorithm and data structures. 747 */ 748 struct xfs_alloc_cur { 749 struct xfs_btree_cur *cnt; /* btree cursors */ 750 struct xfs_btree_cur *bnolt; 751 struct xfs_btree_cur *bnogt; 752 xfs_extlen_t cur_len;/* current search length */ 753 xfs_agblock_t rec_bno;/* extent startblock */ 754 xfs_extlen_t rec_len;/* extent length */ 755 xfs_agblock_t bno; /* alloc bno */ 756 xfs_extlen_t len; /* alloc len */ 757 xfs_extlen_t diff; /* diff from search bno */ 758 unsigned int busy_gen;/* busy state */ 759 bool busy; 760 }; 761 762 /* 763 * Set up cursors, etc. in the extent allocation cursor. This function can be 764 * called multiple times to reset an initialized structure without having to 765 * reallocate cursors. 766 */ 767 static int 768 xfs_alloc_cur_setup( 769 struct xfs_alloc_arg *args, 770 struct xfs_alloc_cur *acur) 771 { 772 int error; 773 int i; 774 775 ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO); 776 777 acur->cur_len = args->maxlen; 778 acur->rec_bno = 0; 779 acur->rec_len = 0; 780 acur->bno = 0; 781 acur->len = 0; 782 acur->diff = -1; 783 acur->busy = false; 784 acur->busy_gen = 0; 785 786 /* 787 * Perform an initial cntbt lookup to check for availability of maxlen 788 * extents. If this fails, we'll return -ENOSPC to signal the caller to 789 * attempt a small allocation. 790 */ 791 if (!acur->cnt) 792 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp, 793 args->agbp, args->pag, XFS_BTNUM_CNT); 794 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); 795 if (error) 796 return error; 797 798 /* 799 * Allocate the bnobt left and right search cursors. 800 */ 801 if (!acur->bnolt) 802 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp, 803 args->agbp, args->pag, XFS_BTNUM_BNO); 804 if (!acur->bnogt) 805 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp, 806 args->agbp, args->pag, XFS_BTNUM_BNO); 807 return i == 1 ? 0 : -ENOSPC; 808 } 809 810 static void 811 xfs_alloc_cur_close( 812 struct xfs_alloc_cur *acur, 813 bool error) 814 { 815 int cur_error = XFS_BTREE_NOERROR; 816 817 if (error) 818 cur_error = XFS_BTREE_ERROR; 819 820 if (acur->cnt) 821 xfs_btree_del_cursor(acur->cnt, cur_error); 822 if (acur->bnolt) 823 xfs_btree_del_cursor(acur->bnolt, cur_error); 824 if (acur->bnogt) 825 xfs_btree_del_cursor(acur->bnogt, cur_error); 826 acur->cnt = acur->bnolt = acur->bnogt = NULL; 827 } 828 829 /* 830 * Check an extent for allocation and track the best available candidate in the 831 * allocation structure. The cursor is deactivated if it has entered an out of 832 * range state based on allocation arguments. Optionally return the extent 833 * extent geometry and allocation status if requested by the caller. 834 */ 835 static int 836 xfs_alloc_cur_check( 837 struct xfs_alloc_arg *args, 838 struct xfs_alloc_cur *acur, 839 struct xfs_btree_cur *cur, 840 int *new) 841 { 842 int error, i; 843 xfs_agblock_t bno, bnoa, bnew; 844 xfs_extlen_t len, lena, diff = -1; 845 bool busy; 846 unsigned busy_gen = 0; 847 bool deactivate = false; 848 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO; 849 850 *new = 0; 851 852 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 853 if (error) 854 return error; 855 if (XFS_IS_CORRUPT(args->mp, i != 1)) 856 return -EFSCORRUPTED; 857 858 /* 859 * Check minlen and deactivate a cntbt cursor if out of acceptable size 860 * range (i.e., walking backwards looking for a minlen extent). 861 */ 862 if (len < args->minlen) { 863 deactivate = !isbnobt; 864 goto out; 865 } 866 867 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena, 868 &busy_gen); 869 acur->busy |= busy; 870 if (busy) 871 acur->busy_gen = busy_gen; 872 /* deactivate a bnobt cursor outside of locality range */ 873 if (bnoa < args->min_agbno || bnoa > args->max_agbno) { 874 deactivate = isbnobt; 875 goto out; 876 } 877 if (lena < args->minlen) 878 goto out; 879 880 args->len = XFS_EXTLEN_MIN(lena, args->maxlen); 881 xfs_alloc_fix_len(args); 882 ASSERT(args->len >= args->minlen); 883 if (args->len < acur->len) 884 goto out; 885 886 /* 887 * We have an aligned record that satisfies minlen and beats or matches 888 * the candidate extent size. Compare locality for near allocation mode. 889 */ 890 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); 891 diff = xfs_alloc_compute_diff(args->agbno, args->len, 892 args->alignment, args->datatype, 893 bnoa, lena, &bnew); 894 if (bnew == NULLAGBLOCK) 895 goto out; 896 897 /* 898 * Deactivate a bnobt cursor with worse locality than the current best. 899 */ 900 if (diff > acur->diff) { 901 deactivate = isbnobt; 902 goto out; 903 } 904 905 ASSERT(args->len > acur->len || 906 (args->len == acur->len && diff <= acur->diff)); 907 acur->rec_bno = bno; 908 acur->rec_len = len; 909 acur->bno = bnew; 910 acur->len = args->len; 911 acur->diff = diff; 912 *new = 1; 913 914 /* 915 * We're done if we found a perfect allocation. This only deactivates 916 * the current cursor, but this is just an optimization to terminate a 917 * cntbt search that otherwise runs to the edge of the tree. 918 */ 919 if (acur->diff == 0 && acur->len == args->maxlen) 920 deactivate = true; 921 out: 922 if (deactivate) 923 cur->bc_ag.abt.active = false; 924 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff, 925 *new); 926 return 0; 927 } 928 929 /* 930 * Complete an allocation of a candidate extent. Remove the extent from both 931 * trees and update the args structure. 932 */ 933 STATIC int 934 xfs_alloc_cur_finish( 935 struct xfs_alloc_arg *args, 936 struct xfs_alloc_cur *acur) 937 { 938 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; 939 int error; 940 941 ASSERT(acur->cnt && acur->bnolt); 942 ASSERT(acur->bno >= acur->rec_bno); 943 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); 944 ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); 945 946 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, 947 acur->rec_len, acur->bno, acur->len, 0); 948 if (error) 949 return error; 950 951 args->agbno = acur->bno; 952 args->len = acur->len; 953 args->wasfromfl = 0; 954 955 trace_xfs_alloc_cur(args); 956 return 0; 957 } 958 959 /* 960 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses 961 * bno optimized lookup to search for extents with ideal size and locality. 962 */ 963 STATIC int 964 xfs_alloc_cntbt_iter( 965 struct xfs_alloc_arg *args, 966 struct xfs_alloc_cur *acur) 967 { 968 struct xfs_btree_cur *cur = acur->cnt; 969 xfs_agblock_t bno; 970 xfs_extlen_t len, cur_len; 971 int error; 972 int i; 973 974 if (!xfs_alloc_cur_active(cur)) 975 return 0; 976 977 /* locality optimized lookup */ 978 cur_len = acur->cur_len; 979 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); 980 if (error) 981 return error; 982 if (i == 0) 983 return 0; 984 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 985 if (error) 986 return error; 987 988 /* check the current record and update search length from it */ 989 error = xfs_alloc_cur_check(args, acur, cur, &i); 990 if (error) 991 return error; 992 ASSERT(len >= acur->cur_len); 993 acur->cur_len = len; 994 995 /* 996 * We looked up the first record >= [agbno, len] above. The agbno is a 997 * secondary key and so the current record may lie just before or after 998 * agbno. If it is past agbno, check the previous record too so long as 999 * the length matches as it may be closer. Don't check a smaller record 1000 * because that could deactivate our cursor. 1001 */ 1002 if (bno > args->agbno) { 1003 error = xfs_btree_decrement(cur, 0, &i); 1004 if (!error && i) { 1005 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 1006 if (!error && i && len == acur->cur_len) 1007 error = xfs_alloc_cur_check(args, acur, cur, 1008 &i); 1009 } 1010 if (error) 1011 return error; 1012 } 1013 1014 /* 1015 * Increment the search key until we find at least one allocation 1016 * candidate or if the extent we found was larger. Otherwise, double the 1017 * search key to optimize the search. Efficiency is more important here 1018 * than absolute best locality. 1019 */ 1020 cur_len <<= 1; 1021 if (!acur->len || acur->cur_len >= cur_len) 1022 acur->cur_len++; 1023 else 1024 acur->cur_len = cur_len; 1025 1026 return error; 1027 } 1028 1029 /* 1030 * Deal with the case where only small freespaces remain. Either return the 1031 * contents of the last freespace record, or allocate space from the freelist if 1032 * there is nothing in the tree. 1033 */ 1034 STATIC int /* error */ 1035 xfs_alloc_ag_vextent_small( 1036 struct xfs_alloc_arg *args, /* allocation argument structure */ 1037 struct xfs_btree_cur *ccur, /* optional by-size cursor */ 1038 xfs_agblock_t *fbnop, /* result block number */ 1039 xfs_extlen_t *flenp, /* result length */ 1040 int *stat) /* status: 0-freelist, 1-normal/none */ 1041 { 1042 struct xfs_agf *agf = args->agbp->b_addr; 1043 int error = 0; 1044 xfs_agblock_t fbno = NULLAGBLOCK; 1045 xfs_extlen_t flen = 0; 1046 int i = 0; 1047 1048 /* 1049 * If a cntbt cursor is provided, try to allocate the largest record in 1050 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the 1051 * allocation. Make sure to respect minleft even when pulling from the 1052 * freelist. 1053 */ 1054 if (ccur) 1055 error = xfs_btree_decrement(ccur, 0, &i); 1056 if (error) 1057 goto error; 1058 if (i) { 1059 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); 1060 if (error) 1061 goto error; 1062 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1063 error = -EFSCORRUPTED; 1064 goto error; 1065 } 1066 goto out; 1067 } 1068 1069 if (args->minlen != 1 || args->alignment != 1 || 1070 args->resv == XFS_AG_RESV_AGFL || 1071 be32_to_cpu(agf->agf_flcount) <= args->minleft) 1072 goto out; 1073 1074 error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, 1075 &fbno, 0); 1076 if (error) 1077 goto error; 1078 if (fbno == NULLAGBLOCK) 1079 goto out; 1080 1081 xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, 1082 (args->datatype & XFS_ALLOC_NOBUSY)); 1083 1084 if (args->datatype & XFS_ALLOC_USERDATA) { 1085 struct xfs_buf *bp; 1086 1087 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, 1088 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), 1089 args->mp->m_bsize, 0, &bp); 1090 if (error) 1091 goto error; 1092 xfs_trans_binval(args->tp, bp); 1093 } 1094 *fbnop = args->agbno = fbno; 1095 *flenp = args->len = 1; 1096 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { 1097 error = -EFSCORRUPTED; 1098 goto error; 1099 } 1100 args->wasfromfl = 1; 1101 trace_xfs_alloc_small_freelist(args); 1102 1103 /* 1104 * If we're feeding an AGFL block to something that doesn't live in the 1105 * free space, we need to clear out the OWN_AG rmap. 1106 */ 1107 error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, 1108 &XFS_RMAP_OINFO_AG); 1109 if (error) 1110 goto error; 1111 1112 *stat = 0; 1113 return 0; 1114 1115 out: 1116 /* 1117 * Can't do the allocation, give up. 1118 */ 1119 if (flen < args->minlen) { 1120 args->agbno = NULLAGBLOCK; 1121 trace_xfs_alloc_small_notenough(args); 1122 flen = 0; 1123 } 1124 *fbnop = fbno; 1125 *flenp = flen; 1126 *stat = 1; 1127 trace_xfs_alloc_small_done(args); 1128 return 0; 1129 1130 error: 1131 trace_xfs_alloc_small_error(args); 1132 return error; 1133 } 1134 1135 /* 1136 * Allocate a variable extent in the allocation group agno. 1137 * Type and bno are used to determine where in the allocation group the 1138 * extent will start. 1139 * Extent's length (returned in *len) will be between minlen and maxlen, 1140 * and of the form k * prod + mod unless there's nothing that large. 1141 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1142 */ 1143 STATIC int /* error */ 1144 xfs_alloc_ag_vextent( 1145 xfs_alloc_arg_t *args) /* argument structure for allocation */ 1146 { 1147 int error=0; 1148 1149 ASSERT(args->minlen > 0); 1150 ASSERT(args->maxlen > 0); 1151 ASSERT(args->minlen <= args->maxlen); 1152 ASSERT(args->mod < args->prod); 1153 ASSERT(args->alignment > 0); 1154 1155 /* 1156 * Branch to correct routine based on the type. 1157 */ 1158 args->wasfromfl = 0; 1159 switch (args->type) { 1160 case XFS_ALLOCTYPE_THIS_AG: 1161 error = xfs_alloc_ag_vextent_size(args); 1162 break; 1163 case XFS_ALLOCTYPE_NEAR_BNO: 1164 error = xfs_alloc_ag_vextent_near(args); 1165 break; 1166 case XFS_ALLOCTYPE_THIS_BNO: 1167 error = xfs_alloc_ag_vextent_exact(args); 1168 break; 1169 default: 1170 ASSERT(0); 1171 /* NOTREACHED */ 1172 } 1173 1174 if (error || args->agbno == NULLAGBLOCK) 1175 return error; 1176 1177 ASSERT(args->len >= args->minlen); 1178 ASSERT(args->len <= args->maxlen); 1179 ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL); 1180 ASSERT(args->agbno % args->alignment == 0); 1181 1182 /* if not file data, insert new block into the reverse map btree */ 1183 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 1184 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, 1185 args->agbno, args->len, &args->oinfo); 1186 if (error) 1187 return error; 1188 } 1189 1190 if (!args->wasfromfl) { 1191 error = xfs_alloc_update_counters(args->tp, args->agbp, 1192 -((long)(args->len))); 1193 if (error) 1194 return error; 1195 1196 ASSERT(!xfs_extent_busy_search(args->mp, args->pag, 1197 args->agbno, args->len)); 1198 } 1199 1200 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 1201 1202 XFS_STATS_INC(args->mp, xs_allocx); 1203 XFS_STATS_ADD(args->mp, xs_allocb, args->len); 1204 return error; 1205 } 1206 1207 /* 1208 * Allocate a variable extent at exactly agno/bno. 1209 * Extent's length (returned in *len) will be between minlen and maxlen, 1210 * and of the form k * prod + mod unless there's nothing that large. 1211 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. 1212 */ 1213 STATIC int /* error */ 1214 xfs_alloc_ag_vextent_exact( 1215 xfs_alloc_arg_t *args) /* allocation argument structure */ 1216 { 1217 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; 1218 struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ 1219 struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ 1220 int error; 1221 xfs_agblock_t fbno; /* start block of found extent */ 1222 xfs_extlen_t flen; /* length of found extent */ 1223 xfs_agblock_t tbno; /* start block of busy extent */ 1224 xfs_extlen_t tlen; /* length of busy extent */ 1225 xfs_agblock_t tend; /* end block of busy extent */ 1226 int i; /* success/failure of operation */ 1227 unsigned busy_gen; 1228 1229 ASSERT(args->alignment == 1); 1230 1231 /* 1232 * Allocate/initialize a cursor for the by-number freespace btree. 1233 */ 1234 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1235 args->pag, XFS_BTNUM_BNO); 1236 1237 /* 1238 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 1239 * Look for the closest free block <= bno, it must contain bno 1240 * if any free block does. 1241 */ 1242 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); 1243 if (error) 1244 goto error0; 1245 if (!i) 1246 goto not_found; 1247 1248 /* 1249 * Grab the freespace record. 1250 */ 1251 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); 1252 if (error) 1253 goto error0; 1254 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1255 error = -EFSCORRUPTED; 1256 goto error0; 1257 } 1258 ASSERT(fbno <= args->agbno); 1259 1260 /* 1261 * Check for overlapping busy extents. 1262 */ 1263 tbno = fbno; 1264 tlen = flen; 1265 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); 1266 1267 /* 1268 * Give up if the start of the extent is busy, or the freespace isn't 1269 * long enough for the minimum request. 1270 */ 1271 if (tbno > args->agbno) 1272 goto not_found; 1273 if (tlen < args->minlen) 1274 goto not_found; 1275 tend = tbno + tlen; 1276 if (tend < args->agbno + args->minlen) 1277 goto not_found; 1278 1279 /* 1280 * End of extent will be smaller of the freespace end and the 1281 * maximal requested end. 1282 * 1283 * Fix the length according to mod and prod if given. 1284 */ 1285 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) 1286 - args->agbno; 1287 xfs_alloc_fix_len(args); 1288 ASSERT(args->agbno + args->len <= tend); 1289 1290 /* 1291 * We are allocating agbno for args->len 1292 * Allocate/initialize a cursor for the by-size btree. 1293 */ 1294 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1295 args->pag, XFS_BTNUM_CNT); 1296 ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); 1297 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, 1298 args->len, XFSA_FIXUP_BNO_OK); 1299 if (error) { 1300 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1301 goto error0; 1302 } 1303 1304 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1305 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1306 1307 args->wasfromfl = 0; 1308 trace_xfs_alloc_exact_done(args); 1309 return 0; 1310 1311 not_found: 1312 /* Didn't find it, return null. */ 1313 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1314 args->agbno = NULLAGBLOCK; 1315 trace_xfs_alloc_exact_notfound(args); 1316 return 0; 1317 1318 error0: 1319 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1320 trace_xfs_alloc_exact_error(args); 1321 return error; 1322 } 1323 1324 /* 1325 * Search a given number of btree records in a given direction. Check each 1326 * record against the good extent we've already found. 1327 */ 1328 STATIC int 1329 xfs_alloc_walk_iter( 1330 struct xfs_alloc_arg *args, 1331 struct xfs_alloc_cur *acur, 1332 struct xfs_btree_cur *cur, 1333 bool increment, 1334 bool find_one, /* quit on first candidate */ 1335 int count, /* rec count (-1 for infinite) */ 1336 int *stat) 1337 { 1338 int error; 1339 int i; 1340 1341 *stat = 0; 1342 1343 /* 1344 * Search so long as the cursor is active or we find a better extent. 1345 * The cursor is deactivated if it extends beyond the range of the 1346 * current allocation candidate. 1347 */ 1348 while (xfs_alloc_cur_active(cur) && count) { 1349 error = xfs_alloc_cur_check(args, acur, cur, &i); 1350 if (error) 1351 return error; 1352 if (i == 1) { 1353 *stat = 1; 1354 if (find_one) 1355 break; 1356 } 1357 if (!xfs_alloc_cur_active(cur)) 1358 break; 1359 1360 if (increment) 1361 error = xfs_btree_increment(cur, 0, &i); 1362 else 1363 error = xfs_btree_decrement(cur, 0, &i); 1364 if (error) 1365 return error; 1366 if (i == 0) 1367 cur->bc_ag.abt.active = false; 1368 1369 if (count > 0) 1370 count--; 1371 } 1372 1373 return 0; 1374 } 1375 1376 /* 1377 * Search the by-bno and by-size btrees in parallel in search of an extent with 1378 * ideal locality based on the NEAR mode ->agbno locality hint. 1379 */ 1380 STATIC int 1381 xfs_alloc_ag_vextent_locality( 1382 struct xfs_alloc_arg *args, 1383 struct xfs_alloc_cur *acur, 1384 int *stat) 1385 { 1386 struct xfs_btree_cur *fbcur = NULL; 1387 int error; 1388 int i; 1389 bool fbinc; 1390 1391 ASSERT(acur->len == 0); 1392 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO); 1393 1394 *stat = 0; 1395 1396 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); 1397 if (error) 1398 return error; 1399 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); 1400 if (error) 1401 return error; 1402 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); 1403 if (error) 1404 return error; 1405 1406 /* 1407 * Search the bnobt and cntbt in parallel. Search the bnobt left and 1408 * right and lookup the closest extent to the locality hint for each 1409 * extent size key in the cntbt. The entire search terminates 1410 * immediately on a bnobt hit because that means we've found best case 1411 * locality. Otherwise the search continues until the cntbt cursor runs 1412 * off the end of the tree. If no allocation candidate is found at this 1413 * point, give up on locality, walk backwards from the end of the cntbt 1414 * and take the first available extent. 1415 * 1416 * The parallel tree searches balance each other out to provide fairly 1417 * consistent performance for various situations. The bnobt search can 1418 * have pathological behavior in the worst case scenario of larger 1419 * allocation requests and fragmented free space. On the other hand, the 1420 * bnobt is able to satisfy most smaller allocation requests much more 1421 * quickly than the cntbt. The cntbt search can sift through fragmented 1422 * free space and sets of free extents for larger allocation requests 1423 * more quickly than the bnobt. Since the locality hint is just a hint 1424 * and we don't want to scan the entire bnobt for perfect locality, the 1425 * cntbt search essentially bounds the bnobt search such that we can 1426 * find good enough locality at reasonable performance in most cases. 1427 */ 1428 while (xfs_alloc_cur_active(acur->bnolt) || 1429 xfs_alloc_cur_active(acur->bnogt) || 1430 xfs_alloc_cur_active(acur->cnt)) { 1431 1432 trace_xfs_alloc_cur_lookup(args); 1433 1434 /* 1435 * Search the bnobt left and right. In the case of a hit, finish 1436 * the search in the opposite direction and we're done. 1437 */ 1438 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, 1439 true, 1, &i); 1440 if (error) 1441 return error; 1442 if (i == 1) { 1443 trace_xfs_alloc_cur_left(args); 1444 fbcur = acur->bnogt; 1445 fbinc = true; 1446 break; 1447 } 1448 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, 1449 1, &i); 1450 if (error) 1451 return error; 1452 if (i == 1) { 1453 trace_xfs_alloc_cur_right(args); 1454 fbcur = acur->bnolt; 1455 fbinc = false; 1456 break; 1457 } 1458 1459 /* 1460 * Check the extent with best locality based on the current 1461 * extent size search key and keep track of the best candidate. 1462 */ 1463 error = xfs_alloc_cntbt_iter(args, acur); 1464 if (error) 1465 return error; 1466 if (!xfs_alloc_cur_active(acur->cnt)) { 1467 trace_xfs_alloc_cur_lookup_done(args); 1468 break; 1469 } 1470 } 1471 1472 /* 1473 * If we failed to find anything due to busy extents, return empty 1474 * handed so the caller can flush and retry. If no busy extents were 1475 * found, walk backwards from the end of the cntbt as a last resort. 1476 */ 1477 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { 1478 error = xfs_btree_decrement(acur->cnt, 0, &i); 1479 if (error) 1480 return error; 1481 if (i) { 1482 acur->cnt->bc_ag.abt.active = true; 1483 fbcur = acur->cnt; 1484 fbinc = false; 1485 } 1486 } 1487 1488 /* 1489 * Search in the opposite direction for a better entry in the case of 1490 * a bnobt hit or walk backwards from the end of the cntbt. 1491 */ 1492 if (fbcur) { 1493 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, 1494 &i); 1495 if (error) 1496 return error; 1497 } 1498 1499 if (acur->len) 1500 *stat = 1; 1501 1502 return 0; 1503 } 1504 1505 /* Check the last block of the cnt btree for allocations. */ 1506 static int 1507 xfs_alloc_ag_vextent_lastblock( 1508 struct xfs_alloc_arg *args, 1509 struct xfs_alloc_cur *acur, 1510 xfs_agblock_t *bno, 1511 xfs_extlen_t *len, 1512 bool *allocated) 1513 { 1514 int error; 1515 int i; 1516 1517 #ifdef DEBUG 1518 /* Randomly don't execute the first algorithm. */ 1519 if (get_random_u32_below(2)) 1520 return 0; 1521 #endif 1522 1523 /* 1524 * Start from the entry that lookup found, sequence through all larger 1525 * free blocks. If we're actually pointing at a record smaller than 1526 * maxlen, go to the start of this block, and skip all those smaller 1527 * than minlen. 1528 */ 1529 if (*len || args->alignment > 1) { 1530 acur->cnt->bc_levels[0].ptr = 1; 1531 do { 1532 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); 1533 if (error) 1534 return error; 1535 if (XFS_IS_CORRUPT(args->mp, i != 1)) 1536 return -EFSCORRUPTED; 1537 if (*len >= args->minlen) 1538 break; 1539 error = xfs_btree_increment(acur->cnt, 0, &i); 1540 if (error) 1541 return error; 1542 } while (i); 1543 ASSERT(*len >= args->minlen); 1544 if (!i) 1545 return 0; 1546 } 1547 1548 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); 1549 if (error) 1550 return error; 1551 1552 /* 1553 * It didn't work. We COULD be in a case where there's a good record 1554 * somewhere, so try again. 1555 */ 1556 if (acur->len == 0) 1557 return 0; 1558 1559 trace_xfs_alloc_near_first(args); 1560 *allocated = true; 1561 return 0; 1562 } 1563 1564 /* 1565 * Allocate a variable extent near bno in the allocation group agno. 1566 * Extent's length (returned in len) will be between minlen and maxlen, 1567 * and of the form k * prod + mod unless there's nothing that large. 1568 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1569 */ 1570 STATIC int 1571 xfs_alloc_ag_vextent_near( 1572 struct xfs_alloc_arg *args) 1573 { 1574 struct xfs_alloc_cur acur = {}; 1575 int error; /* error code */ 1576 int i; /* result code, temporary */ 1577 xfs_agblock_t bno; 1578 xfs_extlen_t len; 1579 1580 /* handle uninitialized agbno range so caller doesn't have to */ 1581 if (!args->min_agbno && !args->max_agbno) 1582 args->max_agbno = args->mp->m_sb.sb_agblocks - 1; 1583 ASSERT(args->min_agbno <= args->max_agbno); 1584 1585 /* clamp agbno to the range if it's outside */ 1586 if (args->agbno < args->min_agbno) 1587 args->agbno = args->min_agbno; 1588 if (args->agbno > args->max_agbno) 1589 args->agbno = args->max_agbno; 1590 1591 restart: 1592 len = 0; 1593 1594 /* 1595 * Set up cursors and see if there are any free extents as big as 1596 * maxlen. If not, pick the last entry in the tree unless the tree is 1597 * empty. 1598 */ 1599 error = xfs_alloc_cur_setup(args, &acur); 1600 if (error == -ENOSPC) { 1601 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, 1602 &len, &i); 1603 if (error) 1604 goto out; 1605 if (i == 0 || len == 0) { 1606 trace_xfs_alloc_near_noentry(args); 1607 goto out; 1608 } 1609 ASSERT(i == 1); 1610 } else if (error) { 1611 goto out; 1612 } 1613 1614 /* 1615 * First algorithm. 1616 * If the requested extent is large wrt the freespaces available 1617 * in this a.g., then the cursor will be pointing to a btree entry 1618 * near the right edge of the tree. If it's in the last btree leaf 1619 * block, then we just examine all the entries in that block 1620 * that are big enough, and pick the best one. 1621 */ 1622 if (xfs_btree_islastblock(acur.cnt, 0)) { 1623 bool allocated = false; 1624 1625 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len, 1626 &allocated); 1627 if (error) 1628 goto out; 1629 if (allocated) 1630 goto alloc_finish; 1631 } 1632 1633 /* 1634 * Second algorithm. Combined cntbt and bnobt search to find ideal 1635 * locality. 1636 */ 1637 error = xfs_alloc_ag_vextent_locality(args, &acur, &i); 1638 if (error) 1639 goto out; 1640 1641 /* 1642 * If we couldn't get anything, give up. 1643 */ 1644 if (!acur.len) { 1645 if (acur.busy) { 1646 trace_xfs_alloc_near_busy(args); 1647 xfs_extent_busy_flush(args->mp, args->pag, 1648 acur.busy_gen); 1649 goto restart; 1650 } 1651 trace_xfs_alloc_size_neither(args); 1652 args->agbno = NULLAGBLOCK; 1653 goto out; 1654 } 1655 1656 alloc_finish: 1657 /* fix up btrees on a successful allocation */ 1658 error = xfs_alloc_cur_finish(args, &acur); 1659 1660 out: 1661 xfs_alloc_cur_close(&acur, error); 1662 return error; 1663 } 1664 1665 /* 1666 * Allocate a variable extent anywhere in the allocation group agno. 1667 * Extent's length (returned in len) will be between minlen and maxlen, 1668 * and of the form k * prod + mod unless there's nothing that large. 1669 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1670 */ 1671 STATIC int /* error */ 1672 xfs_alloc_ag_vextent_size( 1673 xfs_alloc_arg_t *args) /* allocation argument structure */ 1674 { 1675 struct xfs_agf *agf = args->agbp->b_addr; 1676 struct xfs_btree_cur *bno_cur; /* cursor for bno btree */ 1677 struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */ 1678 int error; /* error result */ 1679 xfs_agblock_t fbno; /* start of found freespace */ 1680 xfs_extlen_t flen; /* length of found freespace */ 1681 int i; /* temp status variable */ 1682 xfs_agblock_t rbno; /* returned block number */ 1683 xfs_extlen_t rlen; /* length of returned extent */ 1684 bool busy; 1685 unsigned busy_gen; 1686 1687 restart: 1688 /* 1689 * Allocate and initialize a cursor for the by-size btree. 1690 */ 1691 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1692 args->pag, XFS_BTNUM_CNT); 1693 bno_cur = NULL; 1694 1695 /* 1696 * Look for an entry >= maxlen+alignment-1 blocks. 1697 */ 1698 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1699 args->maxlen + args->alignment - 1, &i))) 1700 goto error0; 1701 1702 /* 1703 * If none then we have to settle for a smaller extent. In the case that 1704 * there are no large extents, this will return the last entry in the 1705 * tree unless the tree is empty. In the case that there are only busy 1706 * large extents, this will return the largest small extent unless there 1707 * are no smaller extents available. 1708 */ 1709 if (!i) { 1710 error = xfs_alloc_ag_vextent_small(args, cnt_cur, 1711 &fbno, &flen, &i); 1712 if (error) 1713 goto error0; 1714 if (i == 0 || flen == 0) { 1715 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1716 trace_xfs_alloc_size_noentry(args); 1717 return 0; 1718 } 1719 ASSERT(i == 1); 1720 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, 1721 &rlen, &busy_gen); 1722 } else { 1723 /* 1724 * Search for a non-busy extent that is large enough. 1725 */ 1726 for (;;) { 1727 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); 1728 if (error) 1729 goto error0; 1730 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1731 error = -EFSCORRUPTED; 1732 goto error0; 1733 } 1734 1735 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1736 &rbno, &rlen, &busy_gen); 1737 1738 if (rlen >= args->maxlen) 1739 break; 1740 1741 error = xfs_btree_increment(cnt_cur, 0, &i); 1742 if (error) 1743 goto error0; 1744 if (i == 0) { 1745 /* 1746 * Our only valid extents must have been busy. 1747 * Make it unbusy by forcing the log out and 1748 * retrying. 1749 */ 1750 xfs_btree_del_cursor(cnt_cur, 1751 XFS_BTREE_NOERROR); 1752 trace_xfs_alloc_size_busy(args); 1753 xfs_extent_busy_flush(args->mp, 1754 args->pag, busy_gen); 1755 goto restart; 1756 } 1757 } 1758 } 1759 1760 /* 1761 * In the first case above, we got the last entry in the 1762 * by-size btree. Now we check to see if the space hits maxlen 1763 * once aligned; if not, we search left for something better. 1764 * This can't happen in the second case above. 1765 */ 1766 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1767 if (XFS_IS_CORRUPT(args->mp, 1768 rlen != 0 && 1769 (rlen > flen || 1770 rbno + rlen > fbno + flen))) { 1771 error = -EFSCORRUPTED; 1772 goto error0; 1773 } 1774 if (rlen < args->maxlen) { 1775 xfs_agblock_t bestfbno; 1776 xfs_extlen_t bestflen; 1777 xfs_agblock_t bestrbno; 1778 xfs_extlen_t bestrlen; 1779 1780 bestrlen = rlen; 1781 bestrbno = rbno; 1782 bestflen = flen; 1783 bestfbno = fbno; 1784 for (;;) { 1785 if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) 1786 goto error0; 1787 if (i == 0) 1788 break; 1789 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, 1790 &i))) 1791 goto error0; 1792 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1793 error = -EFSCORRUPTED; 1794 goto error0; 1795 } 1796 if (flen < bestrlen) 1797 break; 1798 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1799 &rbno, &rlen, &busy_gen); 1800 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1801 if (XFS_IS_CORRUPT(args->mp, 1802 rlen != 0 && 1803 (rlen > flen || 1804 rbno + rlen > fbno + flen))) { 1805 error = -EFSCORRUPTED; 1806 goto error0; 1807 } 1808 if (rlen > bestrlen) { 1809 bestrlen = rlen; 1810 bestrbno = rbno; 1811 bestflen = flen; 1812 bestfbno = fbno; 1813 if (rlen == args->maxlen) 1814 break; 1815 } 1816 } 1817 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, 1818 &i))) 1819 goto error0; 1820 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1821 error = -EFSCORRUPTED; 1822 goto error0; 1823 } 1824 rlen = bestrlen; 1825 rbno = bestrbno; 1826 flen = bestflen; 1827 fbno = bestfbno; 1828 } 1829 args->wasfromfl = 0; 1830 /* 1831 * Fix up the length. 1832 */ 1833 args->len = rlen; 1834 if (rlen < args->minlen) { 1835 if (busy) { 1836 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1837 trace_xfs_alloc_size_busy(args); 1838 xfs_extent_busy_flush(args->mp, args->pag, busy_gen); 1839 goto restart; 1840 } 1841 goto out_nominleft; 1842 } 1843 xfs_alloc_fix_len(args); 1844 1845 rlen = args->len; 1846 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { 1847 error = -EFSCORRUPTED; 1848 goto error0; 1849 } 1850 /* 1851 * Allocate and initialize a cursor for the by-block tree. 1852 */ 1853 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp, 1854 args->pag, XFS_BTNUM_BNO); 1855 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1856 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1857 goto error0; 1858 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1859 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1860 cnt_cur = bno_cur = NULL; 1861 args->len = rlen; 1862 args->agbno = rbno; 1863 if (XFS_IS_CORRUPT(args->mp, 1864 args->agbno + args->len > 1865 be32_to_cpu(agf->agf_length))) { 1866 error = -EFSCORRUPTED; 1867 goto error0; 1868 } 1869 trace_xfs_alloc_size_done(args); 1870 return 0; 1871 1872 error0: 1873 trace_xfs_alloc_size_error(args); 1874 if (cnt_cur) 1875 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1876 if (bno_cur) 1877 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1878 return error; 1879 1880 out_nominleft: 1881 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1882 trace_xfs_alloc_size_nominleft(args); 1883 args->agbno = NULLAGBLOCK; 1884 return 0; 1885 } 1886 1887 /* 1888 * Free the extent starting at agno/bno for length. 1889 */ 1890 STATIC int 1891 xfs_free_ag_extent( 1892 struct xfs_trans *tp, 1893 struct xfs_buf *agbp, 1894 xfs_agnumber_t agno, 1895 xfs_agblock_t bno, 1896 xfs_extlen_t len, 1897 const struct xfs_owner_info *oinfo, 1898 enum xfs_ag_resv_type type) 1899 { 1900 struct xfs_mount *mp; 1901 struct xfs_btree_cur *bno_cur; 1902 struct xfs_btree_cur *cnt_cur; 1903 xfs_agblock_t gtbno; /* start of right neighbor */ 1904 xfs_extlen_t gtlen; /* length of right neighbor */ 1905 xfs_agblock_t ltbno; /* start of left neighbor */ 1906 xfs_extlen_t ltlen; /* length of left neighbor */ 1907 xfs_agblock_t nbno; /* new starting block of freesp */ 1908 xfs_extlen_t nlen; /* new length of freespace */ 1909 int haveleft; /* have a left neighbor */ 1910 int haveright; /* have a right neighbor */ 1911 int i; 1912 int error; 1913 struct xfs_perag *pag = agbp->b_pag; 1914 1915 bno_cur = cnt_cur = NULL; 1916 mp = tp->t_mountp; 1917 1918 if (!xfs_rmap_should_skip_owner_update(oinfo)) { 1919 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); 1920 if (error) 1921 goto error0; 1922 } 1923 1924 /* 1925 * Allocate and initialize a cursor for the by-block btree. 1926 */ 1927 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO); 1928 /* 1929 * Look for a neighboring block on the left (lower block numbers) 1930 * that is contiguous with this space. 1931 */ 1932 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) 1933 goto error0; 1934 if (haveleft) { 1935 /* 1936 * There is a block to our left. 1937 */ 1938 if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) 1939 goto error0; 1940 if (XFS_IS_CORRUPT(mp, i != 1)) { 1941 error = -EFSCORRUPTED; 1942 goto error0; 1943 } 1944 /* 1945 * It's not contiguous, though. 1946 */ 1947 if (ltbno + ltlen < bno) 1948 haveleft = 0; 1949 else { 1950 /* 1951 * If this failure happens the request to free this 1952 * space was invalid, it's (partly) already free. 1953 * Very bad. 1954 */ 1955 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { 1956 error = -EFSCORRUPTED; 1957 goto error0; 1958 } 1959 } 1960 } 1961 /* 1962 * Look for a neighboring block on the right (higher block numbers) 1963 * that is contiguous with this space. 1964 */ 1965 if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) 1966 goto error0; 1967 if (haveright) { 1968 /* 1969 * There is a block to our right. 1970 */ 1971 if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) 1972 goto error0; 1973 if (XFS_IS_CORRUPT(mp, i != 1)) { 1974 error = -EFSCORRUPTED; 1975 goto error0; 1976 } 1977 /* 1978 * It's not contiguous, though. 1979 */ 1980 if (bno + len < gtbno) 1981 haveright = 0; 1982 else { 1983 /* 1984 * If this failure happens the request to free this 1985 * space was invalid, it's (partly) already free. 1986 * Very bad. 1987 */ 1988 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { 1989 error = -EFSCORRUPTED; 1990 goto error0; 1991 } 1992 } 1993 } 1994 /* 1995 * Now allocate and initialize a cursor for the by-size tree. 1996 */ 1997 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT); 1998 /* 1999 * Have both left and right contiguous neighbors. 2000 * Merge all three into a single free block. 2001 */ 2002 if (haveleft && haveright) { 2003 /* 2004 * Delete the old by-size entry on the left. 2005 */ 2006 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2007 goto error0; 2008 if (XFS_IS_CORRUPT(mp, i != 1)) { 2009 error = -EFSCORRUPTED; 2010 goto error0; 2011 } 2012 if ((error = xfs_btree_delete(cnt_cur, &i))) 2013 goto error0; 2014 if (XFS_IS_CORRUPT(mp, i != 1)) { 2015 error = -EFSCORRUPTED; 2016 goto error0; 2017 } 2018 /* 2019 * Delete the old by-size entry on the right. 2020 */ 2021 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2022 goto error0; 2023 if (XFS_IS_CORRUPT(mp, i != 1)) { 2024 error = -EFSCORRUPTED; 2025 goto error0; 2026 } 2027 if ((error = xfs_btree_delete(cnt_cur, &i))) 2028 goto error0; 2029 if (XFS_IS_CORRUPT(mp, i != 1)) { 2030 error = -EFSCORRUPTED; 2031 goto error0; 2032 } 2033 /* 2034 * Delete the old by-block entry for the right block. 2035 */ 2036 if ((error = xfs_btree_delete(bno_cur, &i))) 2037 goto error0; 2038 if (XFS_IS_CORRUPT(mp, i != 1)) { 2039 error = -EFSCORRUPTED; 2040 goto error0; 2041 } 2042 /* 2043 * Move the by-block cursor back to the left neighbor. 2044 */ 2045 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2046 goto error0; 2047 if (XFS_IS_CORRUPT(mp, i != 1)) { 2048 error = -EFSCORRUPTED; 2049 goto error0; 2050 } 2051 #ifdef DEBUG 2052 /* 2053 * Check that this is the right record: delete didn't 2054 * mangle the cursor. 2055 */ 2056 { 2057 xfs_agblock_t xxbno; 2058 xfs_extlen_t xxlen; 2059 2060 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 2061 &i))) 2062 goto error0; 2063 if (XFS_IS_CORRUPT(mp, 2064 i != 1 || 2065 xxbno != ltbno || 2066 xxlen != ltlen)) { 2067 error = -EFSCORRUPTED; 2068 goto error0; 2069 } 2070 } 2071 #endif 2072 /* 2073 * Update remaining by-block entry to the new, joined block. 2074 */ 2075 nbno = ltbno; 2076 nlen = len + ltlen + gtlen; 2077 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2078 goto error0; 2079 } 2080 /* 2081 * Have only a left contiguous neighbor. 2082 * Merge it together with the new freespace. 2083 */ 2084 else if (haveleft) { 2085 /* 2086 * Delete the old by-size entry on the left. 2087 */ 2088 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2089 goto error0; 2090 if (XFS_IS_CORRUPT(mp, i != 1)) { 2091 error = -EFSCORRUPTED; 2092 goto error0; 2093 } 2094 if ((error = xfs_btree_delete(cnt_cur, &i))) 2095 goto error0; 2096 if (XFS_IS_CORRUPT(mp, i != 1)) { 2097 error = -EFSCORRUPTED; 2098 goto error0; 2099 } 2100 /* 2101 * Back up the by-block cursor to the left neighbor, and 2102 * update its length. 2103 */ 2104 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2105 goto error0; 2106 if (XFS_IS_CORRUPT(mp, i != 1)) { 2107 error = -EFSCORRUPTED; 2108 goto error0; 2109 } 2110 nbno = ltbno; 2111 nlen = len + ltlen; 2112 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2113 goto error0; 2114 } 2115 /* 2116 * Have only a right contiguous neighbor. 2117 * Merge it together with the new freespace. 2118 */ 2119 else if (haveright) { 2120 /* 2121 * Delete the old by-size entry on the right. 2122 */ 2123 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2124 goto error0; 2125 if (XFS_IS_CORRUPT(mp, i != 1)) { 2126 error = -EFSCORRUPTED; 2127 goto error0; 2128 } 2129 if ((error = xfs_btree_delete(cnt_cur, &i))) 2130 goto error0; 2131 if (XFS_IS_CORRUPT(mp, i != 1)) { 2132 error = -EFSCORRUPTED; 2133 goto error0; 2134 } 2135 /* 2136 * Update the starting block and length of the right 2137 * neighbor in the by-block tree. 2138 */ 2139 nbno = bno; 2140 nlen = len + gtlen; 2141 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2142 goto error0; 2143 } 2144 /* 2145 * No contiguous neighbors. 2146 * Insert the new freespace into the by-block tree. 2147 */ 2148 else { 2149 nbno = bno; 2150 nlen = len; 2151 if ((error = xfs_btree_insert(bno_cur, &i))) 2152 goto error0; 2153 if (XFS_IS_CORRUPT(mp, i != 1)) { 2154 error = -EFSCORRUPTED; 2155 goto error0; 2156 } 2157 } 2158 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2159 bno_cur = NULL; 2160 /* 2161 * In all cases we need to insert the new freespace in the by-size tree. 2162 */ 2163 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2164 goto error0; 2165 if (XFS_IS_CORRUPT(mp, i != 0)) { 2166 error = -EFSCORRUPTED; 2167 goto error0; 2168 } 2169 if ((error = xfs_btree_insert(cnt_cur, &i))) 2170 goto error0; 2171 if (XFS_IS_CORRUPT(mp, i != 1)) { 2172 error = -EFSCORRUPTED; 2173 goto error0; 2174 } 2175 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2176 cnt_cur = NULL; 2177 2178 /* 2179 * Update the freespace totals in the ag and superblock. 2180 */ 2181 error = xfs_alloc_update_counters(tp, agbp, len); 2182 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); 2183 if (error) 2184 goto error0; 2185 2186 XFS_STATS_INC(mp, xs_freex); 2187 XFS_STATS_ADD(mp, xs_freeb, len); 2188 2189 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); 2190 2191 return 0; 2192 2193 error0: 2194 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); 2195 if (bno_cur) 2196 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2197 if (cnt_cur) 2198 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2199 return error; 2200 } 2201 2202 /* 2203 * Visible (exported) allocation/free functions. 2204 * Some of these are used just by xfs_alloc_btree.c and this file. 2205 */ 2206 2207 /* 2208 * Compute and fill in value of m_alloc_maxlevels. 2209 */ 2210 void 2211 xfs_alloc_compute_maxlevels( 2212 xfs_mount_t *mp) /* file system mount structure */ 2213 { 2214 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2215 (mp->m_sb.sb_agblocks + 1) / 2); 2216 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); 2217 } 2218 2219 /* 2220 * Find the length of the longest extent in an AG. The 'need' parameter 2221 * specifies how much space we're going to need for the AGFL and the 2222 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2223 * other callers. 2224 */ 2225 xfs_extlen_t 2226 xfs_alloc_longest_free_extent( 2227 struct xfs_perag *pag, 2228 xfs_extlen_t need, 2229 xfs_extlen_t reserved) 2230 { 2231 xfs_extlen_t delta = 0; 2232 2233 /* 2234 * If the AGFL needs a recharge, we'll have to subtract that from the 2235 * longest extent. 2236 */ 2237 if (need > pag->pagf_flcount) 2238 delta = need - pag->pagf_flcount; 2239 2240 /* 2241 * If we cannot maintain others' reservations with space from the 2242 * not-longest freesp extents, we'll have to subtract /that/ from 2243 * the longest extent too. 2244 */ 2245 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2246 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2247 2248 /* 2249 * If the longest extent is long enough to satisfy all the 2250 * reservations and AGFL rules in place, we can return this extent. 2251 */ 2252 if (pag->pagf_longest > delta) 2253 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, 2254 pag->pagf_longest - delta); 2255 2256 /* Otherwise, let the caller try for 1 block if there's space. */ 2257 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2258 } 2259 2260 /* 2261 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2262 * return the largest possible minimum length. 2263 */ 2264 unsigned int 2265 xfs_alloc_min_freelist( 2266 struct xfs_mount *mp, 2267 struct xfs_perag *pag) 2268 { 2269 /* AG btrees have at least 1 level. */ 2270 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; 2271 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; 2272 unsigned int min_free; 2273 2274 ASSERT(mp->m_alloc_maxlevels > 0); 2275 2276 /* space needed by-bno freespace btree */ 2277 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, 2278 mp->m_alloc_maxlevels); 2279 /* space needed by-size freespace btree */ 2280 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, 2281 mp->m_alloc_maxlevels); 2282 /* space needed reverse mapping used space btree */ 2283 if (xfs_has_rmapbt(mp)) 2284 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, 2285 mp->m_rmap_maxlevels); 2286 2287 return min_free; 2288 } 2289 2290 /* 2291 * Check if the operation we are fixing up the freelist for should go ahead or 2292 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2293 * is dependent on whether the size and shape of free space available will 2294 * permit the requested allocation to take place. 2295 */ 2296 static bool 2297 xfs_alloc_space_available( 2298 struct xfs_alloc_arg *args, 2299 xfs_extlen_t min_free, 2300 int flags) 2301 { 2302 struct xfs_perag *pag = args->pag; 2303 xfs_extlen_t alloc_len, longest; 2304 xfs_extlen_t reservation; /* blocks that are still reserved */ 2305 int available; 2306 xfs_extlen_t agflcount; 2307 2308 if (flags & XFS_ALLOC_FLAG_FREEING) 2309 return true; 2310 2311 reservation = xfs_ag_resv_needed(pag, args->resv); 2312 2313 /* do we have enough contiguous free space for the allocation? */ 2314 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2315 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2316 if (longest < alloc_len) 2317 return false; 2318 2319 /* 2320 * Do we have enough free space remaining for the allocation? Don't 2321 * account extra agfl blocks because we are about to defer free them, 2322 * making them unavailable until the current transaction commits. 2323 */ 2324 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2325 available = (int)(pag->pagf_freeblks + agflcount - 2326 reservation - min_free - args->minleft); 2327 if (available < (int)max(args->total, alloc_len)) 2328 return false; 2329 2330 /* 2331 * Clamp maxlen to the amount of free space available for the actual 2332 * extent allocation. 2333 */ 2334 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2335 args->maxlen = available; 2336 ASSERT(args->maxlen > 0); 2337 ASSERT(args->maxlen >= args->minlen); 2338 } 2339 2340 return true; 2341 } 2342 2343 int 2344 xfs_free_agfl_block( 2345 struct xfs_trans *tp, 2346 xfs_agnumber_t agno, 2347 xfs_agblock_t agbno, 2348 struct xfs_buf *agbp, 2349 struct xfs_owner_info *oinfo) 2350 { 2351 int error; 2352 struct xfs_buf *bp; 2353 2354 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, 2355 XFS_AG_RESV_AGFL); 2356 if (error) 2357 return error; 2358 2359 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, 2360 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), 2361 tp->t_mountp->m_bsize, 0, &bp); 2362 if (error) 2363 return error; 2364 xfs_trans_binval(tp, bp); 2365 2366 return 0; 2367 } 2368 2369 /* 2370 * Check the agfl fields of the agf for inconsistency or corruption. The purpose 2371 * is to detect an agfl header padding mismatch between current and early v5 2372 * kernels. This problem manifests as a 1-slot size difference between the 2373 * on-disk flcount and the active [first, last] range of a wrapped agfl. This 2374 * may also catch variants of agfl count corruption unrelated to padding. Either 2375 * way, we'll reset the agfl and warn the user. 2376 * 2377 * Return true if a reset is required before the agfl can be used, false 2378 * otherwise. 2379 */ 2380 static bool 2381 xfs_agfl_needs_reset( 2382 struct xfs_mount *mp, 2383 struct xfs_agf *agf) 2384 { 2385 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2386 uint32_t l = be32_to_cpu(agf->agf_fllast); 2387 uint32_t c = be32_to_cpu(agf->agf_flcount); 2388 int agfl_size = xfs_agfl_size(mp); 2389 int active; 2390 2391 /* no agfl header on v4 supers */ 2392 if (!xfs_has_crc(mp)) 2393 return false; 2394 2395 /* 2396 * The agf read verifier catches severe corruption of these fields. 2397 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2398 * the verifier allows it. 2399 */ 2400 if (f >= agfl_size || l >= agfl_size) 2401 return true; 2402 if (c > agfl_size) 2403 return true; 2404 2405 /* 2406 * Check consistency between the on-disk count and the active range. An 2407 * agfl padding mismatch manifests as an inconsistent flcount. 2408 */ 2409 if (c && l >= f) 2410 active = l - f + 1; 2411 else if (c) 2412 active = agfl_size - f + l + 1; 2413 else 2414 active = 0; 2415 2416 return active != c; 2417 } 2418 2419 /* 2420 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2421 * agfl content cannot be trusted. Warn the user that a repair is required to 2422 * recover leaked blocks. 2423 * 2424 * The purpose of this mechanism is to handle filesystems affected by the agfl 2425 * header padding mismatch problem. A reset keeps the filesystem online with a 2426 * relatively minor free space accounting inconsistency rather than suffer the 2427 * inevitable crash from use of an invalid agfl block. 2428 */ 2429 static void 2430 xfs_agfl_reset( 2431 struct xfs_trans *tp, 2432 struct xfs_buf *agbp, 2433 struct xfs_perag *pag) 2434 { 2435 struct xfs_mount *mp = tp->t_mountp; 2436 struct xfs_agf *agf = agbp->b_addr; 2437 2438 ASSERT(pag->pagf_agflreset); 2439 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2440 2441 xfs_warn(mp, 2442 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " 2443 "Please unmount and run xfs_repair.", 2444 pag->pag_agno, pag->pagf_flcount); 2445 2446 agf->agf_flfirst = 0; 2447 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2448 agf->agf_flcount = 0; 2449 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2450 XFS_AGF_FLCOUNT); 2451 2452 pag->pagf_flcount = 0; 2453 pag->pagf_agflreset = false; 2454 } 2455 2456 /* 2457 * Defer an AGFL block free. This is effectively equivalent to 2458 * xfs_free_extent_later() with some special handling particular to AGFL blocks. 2459 * 2460 * Deferring AGFL frees helps prevent log reservation overruns due to too many 2461 * allocation operations in a transaction. AGFL frees are prone to this problem 2462 * because for one they are always freed one at a time. Further, an immediate 2463 * AGFL block free can cause a btree join and require another block free before 2464 * the real allocation can proceed. Deferring the free disconnects freeing up 2465 * the AGFL slot from freeing the block. 2466 */ 2467 STATIC void 2468 xfs_defer_agfl_block( 2469 struct xfs_trans *tp, 2470 xfs_agnumber_t agno, 2471 xfs_fsblock_t agbno, 2472 struct xfs_owner_info *oinfo) 2473 { 2474 struct xfs_mount *mp = tp->t_mountp; 2475 struct xfs_extent_free_item *new; /* new element */ 2476 2477 ASSERT(xfs_extfree_item_cache != NULL); 2478 ASSERT(oinfo != NULL); 2479 2480 new = kmem_cache_zalloc(xfs_extfree_item_cache, 2481 GFP_KERNEL | __GFP_NOFAIL); 2482 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2483 new->xefi_blockcount = 1; 2484 new->xefi_owner = oinfo->oi_owner; 2485 2486 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); 2487 2488 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list); 2489 } 2490 2491 /* 2492 * Add the extent to the list of extents to be free at transaction end. 2493 * The list is maintained sorted (by block number). 2494 */ 2495 void 2496 __xfs_free_extent_later( 2497 struct xfs_trans *tp, 2498 xfs_fsblock_t bno, 2499 xfs_filblks_t len, 2500 const struct xfs_owner_info *oinfo, 2501 bool skip_discard) 2502 { 2503 struct xfs_extent_free_item *new; /* new element */ 2504 #ifdef DEBUG 2505 struct xfs_mount *mp = tp->t_mountp; 2506 xfs_agnumber_t agno; 2507 xfs_agblock_t agbno; 2508 2509 ASSERT(bno != NULLFSBLOCK); 2510 ASSERT(len > 0); 2511 ASSERT(len <= XFS_MAX_BMBT_EXTLEN); 2512 ASSERT(!isnullstartblock(bno)); 2513 agno = XFS_FSB_TO_AGNO(mp, bno); 2514 agbno = XFS_FSB_TO_AGBNO(mp, bno); 2515 ASSERT(agno < mp->m_sb.sb_agcount); 2516 ASSERT(agbno < mp->m_sb.sb_agblocks); 2517 ASSERT(len < mp->m_sb.sb_agblocks); 2518 ASSERT(agbno + len <= mp->m_sb.sb_agblocks); 2519 #endif 2520 ASSERT(xfs_extfree_item_cache != NULL); 2521 2522 new = kmem_cache_zalloc(xfs_extfree_item_cache, 2523 GFP_KERNEL | __GFP_NOFAIL); 2524 new->xefi_startblock = bno; 2525 new->xefi_blockcount = (xfs_extlen_t)len; 2526 if (skip_discard) 2527 new->xefi_flags |= XFS_EFI_SKIP_DISCARD; 2528 if (oinfo) { 2529 ASSERT(oinfo->oi_offset == 0); 2530 2531 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 2532 new->xefi_flags |= XFS_EFI_ATTR_FORK; 2533 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 2534 new->xefi_flags |= XFS_EFI_BMBT_BLOCK; 2535 new->xefi_owner = oinfo->oi_owner; 2536 } else { 2537 new->xefi_owner = XFS_RMAP_OWN_NULL; 2538 } 2539 trace_xfs_bmap_free_defer(tp->t_mountp, 2540 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, 2541 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); 2542 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list); 2543 } 2544 2545 #ifdef DEBUG 2546 /* 2547 * Check if an AGF has a free extent record whose length is equal to 2548 * args->minlen. 2549 */ 2550 STATIC int 2551 xfs_exact_minlen_extent_available( 2552 struct xfs_alloc_arg *args, 2553 struct xfs_buf *agbp, 2554 int *stat) 2555 { 2556 struct xfs_btree_cur *cnt_cur; 2557 xfs_agblock_t fbno; 2558 xfs_extlen_t flen; 2559 int error = 0; 2560 2561 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, 2562 args->pag, XFS_BTNUM_CNT); 2563 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); 2564 if (error) 2565 goto out; 2566 2567 if (*stat == 0) { 2568 error = -EFSCORRUPTED; 2569 goto out; 2570 } 2571 2572 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); 2573 if (error) 2574 goto out; 2575 2576 if (*stat == 1 && flen != args->minlen) 2577 *stat = 0; 2578 2579 out: 2580 xfs_btree_del_cursor(cnt_cur, error); 2581 2582 return error; 2583 } 2584 #endif 2585 2586 /* 2587 * Decide whether to use this allocation group for this allocation. 2588 * If so, fix up the btree freelist's size. 2589 */ 2590 int /* error */ 2591 xfs_alloc_fix_freelist( 2592 struct xfs_alloc_arg *args, /* allocation argument structure */ 2593 int flags) /* XFS_ALLOC_FLAG_... */ 2594 { 2595 struct xfs_mount *mp = args->mp; 2596 struct xfs_perag *pag = args->pag; 2597 struct xfs_trans *tp = args->tp; 2598 struct xfs_buf *agbp = NULL; 2599 struct xfs_buf *agflbp = NULL; 2600 struct xfs_alloc_arg targs; /* local allocation arguments */ 2601 xfs_agblock_t bno; /* freelist block */ 2602 xfs_extlen_t need; /* total blocks needed in freelist */ 2603 int error = 0; 2604 2605 /* deferred ops (AGFL block frees) require permanent transactions */ 2606 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2607 2608 if (!pag->pagf_init) { 2609 error = xfs_alloc_read_agf(pag, tp, flags, &agbp); 2610 if (error) { 2611 /* Couldn't lock the AGF so skip this AG. */ 2612 if (error == -EAGAIN) 2613 error = 0; 2614 goto out_no_agbp; 2615 } 2616 } 2617 2618 /* 2619 * If this is a metadata preferred pag and we are user data then try 2620 * somewhere else if we are not being asked to try harder at this 2621 * point 2622 */ 2623 if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) && 2624 (flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2625 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); 2626 goto out_agbp_relse; 2627 } 2628 2629 need = xfs_alloc_min_freelist(mp, pag); 2630 if (!xfs_alloc_space_available(args, need, flags | 2631 XFS_ALLOC_FLAG_CHECK)) 2632 goto out_agbp_relse; 2633 2634 /* 2635 * Get the a.g. freespace buffer. 2636 * Can fail if we're not blocking on locks, and it's held. 2637 */ 2638 if (!agbp) { 2639 error = xfs_alloc_read_agf(pag, tp, flags, &agbp); 2640 if (error) { 2641 /* Couldn't lock the AGF so skip this AG. */ 2642 if (error == -EAGAIN) 2643 error = 0; 2644 goto out_no_agbp; 2645 } 2646 } 2647 2648 /* reset a padding mismatched agfl before final free space check */ 2649 if (pag->pagf_agflreset) 2650 xfs_agfl_reset(tp, agbp, pag); 2651 2652 /* If there isn't enough total space or single-extent, reject it. */ 2653 need = xfs_alloc_min_freelist(mp, pag); 2654 if (!xfs_alloc_space_available(args, need, flags)) 2655 goto out_agbp_relse; 2656 2657 #ifdef DEBUG 2658 if (args->alloc_minlen_only) { 2659 int stat; 2660 2661 error = xfs_exact_minlen_extent_available(args, agbp, &stat); 2662 if (error || !stat) 2663 goto out_agbp_relse; 2664 } 2665 #endif 2666 /* 2667 * Make the freelist shorter if it's too long. 2668 * 2669 * Note that from this point onwards, we will always release the agf and 2670 * agfl buffers on error. This handles the case where we error out and 2671 * the buffers are clean or may not have been joined to the transaction 2672 * and hence need to be released manually. If they have been joined to 2673 * the transaction, then xfs_trans_brelse() will handle them 2674 * appropriately based on the recursion count and dirty state of the 2675 * buffer. 2676 * 2677 * XXX (dgc): When we have lots of free space, does this buy us 2678 * anything other than extra overhead when we need to put more blocks 2679 * back on the free list? Maybe we should only do this when space is 2680 * getting low or the AGFL is more than half full? 2681 * 2682 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2683 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2684 * updating the rmapbt. Both flags are used in xfs_repair while we're 2685 * rebuilding the rmapbt, and neither are used by the kernel. They're 2686 * both required to ensure that rmaps are correctly recorded for the 2687 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2688 * repair/rmap.c in xfsprogs for details. 2689 */ 2690 memset(&targs, 0, sizeof(targs)); 2691 /* struct copy below */ 2692 if (flags & XFS_ALLOC_FLAG_NORMAP) 2693 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2694 else 2695 targs.oinfo = XFS_RMAP_OINFO_AG; 2696 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { 2697 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); 2698 if (error) 2699 goto out_agbp_relse; 2700 2701 /* defer agfl frees */ 2702 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); 2703 } 2704 2705 targs.tp = tp; 2706 targs.mp = mp; 2707 targs.agbp = agbp; 2708 targs.agno = args->agno; 2709 targs.alignment = targs.minlen = targs.prod = 1; 2710 targs.type = XFS_ALLOCTYPE_THIS_AG; 2711 targs.pag = pag; 2712 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2713 if (error) 2714 goto out_agbp_relse; 2715 2716 /* Make the freelist longer if it's too short. */ 2717 while (pag->pagf_flcount < need) { 2718 targs.agbno = 0; 2719 targs.maxlen = need - pag->pagf_flcount; 2720 targs.resv = XFS_AG_RESV_AGFL; 2721 2722 /* Allocate as many blocks as possible at once. */ 2723 error = xfs_alloc_ag_vextent(&targs); 2724 if (error) 2725 goto out_agflbp_relse; 2726 2727 /* 2728 * Stop if we run out. Won't happen if callers are obeying 2729 * the restrictions correctly. Can happen for free calls 2730 * on a completely full ag. 2731 */ 2732 if (targs.agbno == NULLAGBLOCK) { 2733 if (flags & XFS_ALLOC_FLAG_FREEING) 2734 break; 2735 goto out_agflbp_relse; 2736 } 2737 /* 2738 * Put each allocated block on the list. 2739 */ 2740 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 2741 error = xfs_alloc_put_freelist(pag, tp, agbp, 2742 agflbp, bno, 0); 2743 if (error) 2744 goto out_agflbp_relse; 2745 } 2746 } 2747 xfs_trans_brelse(tp, agflbp); 2748 args->agbp = agbp; 2749 return 0; 2750 2751 out_agflbp_relse: 2752 xfs_trans_brelse(tp, agflbp); 2753 out_agbp_relse: 2754 if (agbp) 2755 xfs_trans_brelse(tp, agbp); 2756 out_no_agbp: 2757 args->agbp = NULL; 2758 return error; 2759 } 2760 2761 /* 2762 * Get a block from the freelist. 2763 * Returns with the buffer for the block gotten. 2764 */ 2765 int 2766 xfs_alloc_get_freelist( 2767 struct xfs_perag *pag, 2768 struct xfs_trans *tp, 2769 struct xfs_buf *agbp, 2770 xfs_agblock_t *bnop, 2771 int btreeblk) 2772 { 2773 struct xfs_agf *agf = agbp->b_addr; 2774 struct xfs_buf *agflbp; 2775 xfs_agblock_t bno; 2776 __be32 *agfl_bno; 2777 int error; 2778 uint32_t logflags; 2779 struct xfs_mount *mp = tp->t_mountp; 2780 2781 /* 2782 * Freelist is empty, give up. 2783 */ 2784 if (!agf->agf_flcount) { 2785 *bnop = NULLAGBLOCK; 2786 return 0; 2787 } 2788 /* 2789 * Read the array of free blocks. 2790 */ 2791 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2792 if (error) 2793 return error; 2794 2795 2796 /* 2797 * Get the block number and update the data structures. 2798 */ 2799 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2800 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 2801 be32_add_cpu(&agf->agf_flfirst, 1); 2802 xfs_trans_brelse(tp, agflbp); 2803 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 2804 agf->agf_flfirst = 0; 2805 2806 ASSERT(!pag->pagf_agflreset); 2807 be32_add_cpu(&agf->agf_flcount, -1); 2808 pag->pagf_flcount--; 2809 2810 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 2811 if (btreeblk) { 2812 be32_add_cpu(&agf->agf_btreeblks, 1); 2813 pag->pagf_btreeblks++; 2814 logflags |= XFS_AGF_BTREEBLKS; 2815 } 2816 2817 xfs_alloc_log_agf(tp, agbp, logflags); 2818 *bnop = bno; 2819 2820 return 0; 2821 } 2822 2823 /* 2824 * Log the given fields from the agf structure. 2825 */ 2826 void 2827 xfs_alloc_log_agf( 2828 struct xfs_trans *tp, 2829 struct xfs_buf *bp, 2830 uint32_t fields) 2831 { 2832 int first; /* first byte offset */ 2833 int last; /* last byte offset */ 2834 static const short offsets[] = { 2835 offsetof(xfs_agf_t, agf_magicnum), 2836 offsetof(xfs_agf_t, agf_versionnum), 2837 offsetof(xfs_agf_t, agf_seqno), 2838 offsetof(xfs_agf_t, agf_length), 2839 offsetof(xfs_agf_t, agf_roots[0]), 2840 offsetof(xfs_agf_t, agf_levels[0]), 2841 offsetof(xfs_agf_t, agf_flfirst), 2842 offsetof(xfs_agf_t, agf_fllast), 2843 offsetof(xfs_agf_t, agf_flcount), 2844 offsetof(xfs_agf_t, agf_freeblks), 2845 offsetof(xfs_agf_t, agf_longest), 2846 offsetof(xfs_agf_t, agf_btreeblks), 2847 offsetof(xfs_agf_t, agf_uuid), 2848 offsetof(xfs_agf_t, agf_rmap_blocks), 2849 offsetof(xfs_agf_t, agf_refcount_blocks), 2850 offsetof(xfs_agf_t, agf_refcount_root), 2851 offsetof(xfs_agf_t, agf_refcount_level), 2852 /* needed so that we don't log the whole rest of the structure: */ 2853 offsetof(xfs_agf_t, agf_spare64), 2854 sizeof(xfs_agf_t) 2855 }; 2856 2857 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); 2858 2859 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 2860 2861 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 2862 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 2863 } 2864 2865 /* 2866 * Put the block on the freelist for the allocation group. 2867 */ 2868 int 2869 xfs_alloc_put_freelist( 2870 struct xfs_perag *pag, 2871 struct xfs_trans *tp, 2872 struct xfs_buf *agbp, 2873 struct xfs_buf *agflbp, 2874 xfs_agblock_t bno, 2875 int btreeblk) 2876 { 2877 struct xfs_mount *mp = tp->t_mountp; 2878 struct xfs_agf *agf = agbp->b_addr; 2879 __be32 *blockp; 2880 int error; 2881 uint32_t logflags; 2882 __be32 *agfl_bno; 2883 int startoff; 2884 2885 if (!agflbp) { 2886 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2887 if (error) 2888 return error; 2889 } 2890 2891 be32_add_cpu(&agf->agf_fllast, 1); 2892 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 2893 agf->agf_fllast = 0; 2894 2895 ASSERT(!pag->pagf_agflreset); 2896 be32_add_cpu(&agf->agf_flcount, 1); 2897 pag->pagf_flcount++; 2898 2899 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 2900 if (btreeblk) { 2901 be32_add_cpu(&agf->agf_btreeblks, -1); 2902 pag->pagf_btreeblks--; 2903 logflags |= XFS_AGF_BTREEBLKS; 2904 } 2905 2906 xfs_alloc_log_agf(tp, agbp, logflags); 2907 2908 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 2909 2910 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2911 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2912 *blockp = cpu_to_be32(bno); 2913 startoff = (char *)blockp - (char *)agflbp->b_addr; 2914 2915 xfs_alloc_log_agf(tp, agbp, logflags); 2916 2917 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 2918 xfs_trans_log_buf(tp, agflbp, startoff, 2919 startoff + sizeof(xfs_agblock_t) - 1); 2920 return 0; 2921 } 2922 2923 static xfs_failaddr_t 2924 xfs_agf_verify( 2925 struct xfs_buf *bp) 2926 { 2927 struct xfs_mount *mp = bp->b_mount; 2928 struct xfs_agf *agf = bp->b_addr; 2929 2930 if (xfs_has_crc(mp)) { 2931 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 2932 return __this_address; 2933 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) 2934 return __this_address; 2935 } 2936 2937 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 2938 return __this_address; 2939 2940 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2941 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2942 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && 2943 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && 2944 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) 2945 return __this_address; 2946 2947 if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) 2948 return __this_address; 2949 2950 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || 2951 be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) 2952 return __this_address; 2953 2954 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || 2955 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || 2956 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > 2957 mp->m_alloc_maxlevels || 2958 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > 2959 mp->m_alloc_maxlevels) 2960 return __this_address; 2961 2962 if (xfs_has_rmapbt(mp) && 2963 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || 2964 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > 2965 mp->m_rmap_maxlevels)) 2966 return __this_address; 2967 2968 if (xfs_has_rmapbt(mp) && 2969 be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) 2970 return __this_address; 2971 2972 /* 2973 * during growfs operations, the perag is not fully initialised, 2974 * so we can't use it for any useful checking. growfs ensures we can't 2975 * use it by using uncached buffers that don't have the perag attached 2976 * so we can detect and avoid this problem. 2977 */ 2978 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) 2979 return __this_address; 2980 2981 if (xfs_has_lazysbcount(mp) && 2982 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) 2983 return __this_address; 2984 2985 if (xfs_has_reflink(mp) && 2986 be32_to_cpu(agf->agf_refcount_blocks) > 2987 be32_to_cpu(agf->agf_length)) 2988 return __this_address; 2989 2990 if (xfs_has_reflink(mp) && 2991 (be32_to_cpu(agf->agf_refcount_level) < 1 || 2992 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) 2993 return __this_address; 2994 2995 return NULL; 2996 2997 } 2998 2999 static void 3000 xfs_agf_read_verify( 3001 struct xfs_buf *bp) 3002 { 3003 struct xfs_mount *mp = bp->b_mount; 3004 xfs_failaddr_t fa; 3005 3006 if (xfs_has_crc(mp) && 3007 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 3008 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 3009 else { 3010 fa = xfs_agf_verify(bp); 3011 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 3012 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3013 } 3014 } 3015 3016 static void 3017 xfs_agf_write_verify( 3018 struct xfs_buf *bp) 3019 { 3020 struct xfs_mount *mp = bp->b_mount; 3021 struct xfs_buf_log_item *bip = bp->b_log_item; 3022 struct xfs_agf *agf = bp->b_addr; 3023 xfs_failaddr_t fa; 3024 3025 fa = xfs_agf_verify(bp); 3026 if (fa) { 3027 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3028 return; 3029 } 3030 3031 if (!xfs_has_crc(mp)) 3032 return; 3033 3034 if (bip) 3035 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 3036 3037 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 3038 } 3039 3040 const struct xfs_buf_ops xfs_agf_buf_ops = { 3041 .name = "xfs_agf", 3042 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 3043 .verify_read = xfs_agf_read_verify, 3044 .verify_write = xfs_agf_write_verify, 3045 .verify_struct = xfs_agf_verify, 3046 }; 3047 3048 /* 3049 * Read in the allocation group header (free/alloc section). 3050 */ 3051 int 3052 xfs_read_agf( 3053 struct xfs_perag *pag, 3054 struct xfs_trans *tp, 3055 int flags, 3056 struct xfs_buf **agfbpp) 3057 { 3058 struct xfs_mount *mp = pag->pag_mount; 3059 int error; 3060 3061 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); 3062 3063 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 3064 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), 3065 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); 3066 if (error) 3067 return error; 3068 3069 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); 3070 return 0; 3071 } 3072 3073 /* 3074 * Read in the allocation group header (free/alloc section) and initialise the 3075 * perag structure if necessary. If the caller provides @agfbpp, then return the 3076 * locked buffer to the caller, otherwise free it. 3077 */ 3078 int 3079 xfs_alloc_read_agf( 3080 struct xfs_perag *pag, 3081 struct xfs_trans *tp, 3082 int flags, 3083 struct xfs_buf **agfbpp) 3084 { 3085 struct xfs_buf *agfbp; 3086 struct xfs_agf *agf; 3087 int error; 3088 int allocbt_blks; 3089 3090 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); 3091 3092 /* We don't support trylock when freeing. */ 3093 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 3094 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 3095 error = xfs_read_agf(pag, tp, 3096 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, 3097 &agfbp); 3098 if (error) 3099 return error; 3100 3101 agf = agfbp->b_addr; 3102 if (!pag->pagf_init) { 3103 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3104 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3105 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3106 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3107 pag->pagf_levels[XFS_BTNUM_BNOi] = 3108 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); 3109 pag->pagf_levels[XFS_BTNUM_CNTi] = 3110 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 3111 pag->pagf_levels[XFS_BTNUM_RMAPi] = 3112 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); 3113 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3114 pag->pagf_init = 1; 3115 pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf); 3116 3117 /* 3118 * Update the in-core allocbt counter. Filter out the rmapbt 3119 * subset of the btreeblks counter because the rmapbt is managed 3120 * by perag reservation. Subtract one for the rmapbt root block 3121 * because the rmap counter includes it while the btreeblks 3122 * counter only tracks non-root blocks. 3123 */ 3124 allocbt_blks = pag->pagf_btreeblks; 3125 if (xfs_has_rmapbt(pag->pag_mount)) 3126 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; 3127 if (allocbt_blks > 0) 3128 atomic64_add(allocbt_blks, 3129 &pag->pag_mount->m_allocbt_blks); 3130 } 3131 #ifdef DEBUG 3132 else if (!xfs_is_shutdown(pag->pag_mount)) { 3133 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3134 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3135 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3136 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3137 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 3138 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); 3139 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == 3140 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 3141 } 3142 #endif 3143 if (agfbpp) 3144 *agfbpp = agfbp; 3145 else 3146 xfs_trans_brelse(tp, agfbp); 3147 return 0; 3148 } 3149 3150 /* 3151 * Allocate an extent (variable-size). 3152 * Depending on the allocation type, we either look in a single allocation 3153 * group or loop over the allocation groups to find the result. 3154 */ 3155 int /* error */ 3156 xfs_alloc_vextent( 3157 struct xfs_alloc_arg *args) /* allocation argument structure */ 3158 { 3159 xfs_agblock_t agsize; /* allocation group size */ 3160 int error; 3161 int flags; /* XFS_ALLOC_FLAG_... locking flags */ 3162 struct xfs_mount *mp; /* mount structure pointer */ 3163 xfs_agnumber_t sagno; /* starting allocation group number */ 3164 xfs_alloctype_t type; /* input allocation type */ 3165 int bump_rotor = 0; 3166 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */ 3167 3168 mp = args->mp; 3169 type = args->otype = args->type; 3170 args->agbno = NULLAGBLOCK; 3171 /* 3172 * Just fix this up, for the case where the last a.g. is shorter 3173 * (or there's only one a.g.) and the caller couldn't easily figure 3174 * that out (xfs_bmap_alloc). 3175 */ 3176 agsize = mp->m_sb.sb_agblocks; 3177 if (args->maxlen > agsize) 3178 args->maxlen = agsize; 3179 if (args->alignment == 0) 3180 args->alignment = 1; 3181 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount); 3182 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize); 3183 ASSERT(args->minlen <= args->maxlen); 3184 ASSERT(args->minlen <= agsize); 3185 ASSERT(args->mod < args->prod); 3186 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount || 3187 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize || 3188 args->minlen > args->maxlen || args->minlen > agsize || 3189 args->mod >= args->prod) { 3190 args->fsbno = NULLFSBLOCK; 3191 trace_xfs_alloc_vextent_badargs(args); 3192 return 0; 3193 } 3194 3195 switch (type) { 3196 case XFS_ALLOCTYPE_THIS_AG: 3197 case XFS_ALLOCTYPE_NEAR_BNO: 3198 case XFS_ALLOCTYPE_THIS_BNO: 3199 /* 3200 * These three force us into a single a.g. 3201 */ 3202 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); 3203 args->pag = xfs_perag_get(mp, args->agno); 3204 error = xfs_alloc_fix_freelist(args, 0); 3205 if (error) { 3206 trace_xfs_alloc_vextent_nofix(args); 3207 goto error0; 3208 } 3209 if (!args->agbp) { 3210 trace_xfs_alloc_vextent_noagbp(args); 3211 break; 3212 } 3213 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 3214 if ((error = xfs_alloc_ag_vextent(args))) 3215 goto error0; 3216 break; 3217 case XFS_ALLOCTYPE_START_BNO: 3218 /* 3219 * Try near allocation first, then anywhere-in-ag after 3220 * the first a.g. fails. 3221 */ 3222 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3223 xfs_is_inode32(mp)) { 3224 args->fsbno = XFS_AGB_TO_FSB(mp, 3225 ((mp->m_agfrotor / rotorstep) % 3226 mp->m_sb.sb_agcount), 0); 3227 bump_rotor = 1; 3228 } 3229 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno); 3230 args->type = XFS_ALLOCTYPE_NEAR_BNO; 3231 fallthrough; 3232 case XFS_ALLOCTYPE_FIRST_AG: 3233 /* 3234 * Rotate through the allocation groups looking for a winner. 3235 */ 3236 if (type == XFS_ALLOCTYPE_FIRST_AG) { 3237 /* 3238 * Start with allocation group given by bno. 3239 */ 3240 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno); 3241 args->type = XFS_ALLOCTYPE_THIS_AG; 3242 sagno = 0; 3243 flags = 0; 3244 } else { 3245 /* 3246 * Start with the given allocation group. 3247 */ 3248 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno); 3249 flags = XFS_ALLOC_FLAG_TRYLOCK; 3250 } 3251 /* 3252 * Loop over allocation groups twice; first time with 3253 * trylock set, second time without. 3254 */ 3255 for (;;) { 3256 args->pag = xfs_perag_get(mp, args->agno); 3257 error = xfs_alloc_fix_freelist(args, flags); 3258 if (error) { 3259 trace_xfs_alloc_vextent_nofix(args); 3260 goto error0; 3261 } 3262 /* 3263 * If we get a buffer back then the allocation will fly. 3264 */ 3265 if (args->agbp) { 3266 if ((error = xfs_alloc_ag_vextent(args))) 3267 goto error0; 3268 break; 3269 } 3270 3271 trace_xfs_alloc_vextent_loopfailed(args); 3272 3273 /* 3274 * Didn't work, figure out the next iteration. 3275 */ 3276 if (args->agno == sagno && 3277 type == XFS_ALLOCTYPE_START_BNO) 3278 args->type = XFS_ALLOCTYPE_THIS_AG; 3279 /* 3280 * For the first allocation, we can try any AG to get 3281 * space. However, if we already have allocated a 3282 * block, we don't want to try AGs whose number is below 3283 * sagno. Otherwise, we may end up with out-of-order 3284 * locking of AGF, which might cause deadlock. 3285 */ 3286 if (++(args->agno) == mp->m_sb.sb_agcount) { 3287 if (args->tp->t_firstblock != NULLFSBLOCK) 3288 args->agno = sagno; 3289 else 3290 args->agno = 0; 3291 } 3292 /* 3293 * Reached the starting a.g., must either be done 3294 * or switch to non-trylock mode. 3295 */ 3296 if (args->agno == sagno) { 3297 if (flags == 0) { 3298 args->agbno = NULLAGBLOCK; 3299 trace_xfs_alloc_vextent_allfailed(args); 3300 break; 3301 } 3302 3303 flags = 0; 3304 if (type == XFS_ALLOCTYPE_START_BNO) { 3305 args->agbno = XFS_FSB_TO_AGBNO(mp, 3306 args->fsbno); 3307 args->type = XFS_ALLOCTYPE_NEAR_BNO; 3308 } 3309 } 3310 xfs_perag_put(args->pag); 3311 } 3312 if (bump_rotor) { 3313 if (args->agno == sagno) 3314 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3315 (mp->m_sb.sb_agcount * rotorstep); 3316 else 3317 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3318 (mp->m_sb.sb_agcount * rotorstep); 3319 } 3320 break; 3321 default: 3322 ASSERT(0); 3323 /* NOTREACHED */ 3324 } 3325 if (args->agbno == NULLAGBLOCK) 3326 args->fsbno = NULLFSBLOCK; 3327 else { 3328 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); 3329 #ifdef DEBUG 3330 ASSERT(args->len >= args->minlen); 3331 ASSERT(args->len <= args->maxlen); 3332 ASSERT(args->agbno % args->alignment == 0); 3333 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), 3334 args->len); 3335 #endif 3336 3337 } 3338 xfs_perag_put(args->pag); 3339 return 0; 3340 error0: 3341 xfs_perag_put(args->pag); 3342 return error; 3343 } 3344 3345 /* Ensure that the freelist is at full capacity. */ 3346 int 3347 xfs_free_extent_fix_freelist( 3348 struct xfs_trans *tp, 3349 struct xfs_perag *pag, 3350 struct xfs_buf **agbp) 3351 { 3352 struct xfs_alloc_arg args; 3353 int error; 3354 3355 memset(&args, 0, sizeof(struct xfs_alloc_arg)); 3356 args.tp = tp; 3357 args.mp = tp->t_mountp; 3358 args.agno = pag->pag_agno; 3359 args.pag = pag; 3360 3361 /* 3362 * validate that the block number is legal - the enables us to detect 3363 * and handle a silent filesystem corruption rather than crashing. 3364 */ 3365 if (args.agno >= args.mp->m_sb.sb_agcount) 3366 return -EFSCORRUPTED; 3367 3368 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 3369 if (error) 3370 return error; 3371 3372 *agbp = args.agbp; 3373 return 0; 3374 } 3375 3376 /* 3377 * Free an extent. 3378 * Just break up the extent address and hand off to xfs_free_ag_extent 3379 * after fixing up the freelist. 3380 */ 3381 int 3382 __xfs_free_extent( 3383 struct xfs_trans *tp, 3384 xfs_fsblock_t bno, 3385 xfs_extlen_t len, 3386 const struct xfs_owner_info *oinfo, 3387 enum xfs_ag_resv_type type, 3388 bool skip_discard) 3389 { 3390 struct xfs_mount *mp = tp->t_mountp; 3391 struct xfs_buf *agbp; 3392 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno); 3393 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno); 3394 struct xfs_agf *agf; 3395 int error; 3396 unsigned int busy_flags = 0; 3397 struct xfs_perag *pag; 3398 3399 ASSERT(len != 0); 3400 ASSERT(type != XFS_AG_RESV_AGFL); 3401 3402 if (XFS_TEST_ERROR(false, mp, 3403 XFS_ERRTAG_FREE_EXTENT)) 3404 return -EIO; 3405 3406 pag = xfs_perag_get(mp, agno); 3407 error = xfs_free_extent_fix_freelist(tp, pag, &agbp); 3408 if (error) 3409 goto err; 3410 agf = agbp->b_addr; 3411 3412 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { 3413 error = -EFSCORRUPTED; 3414 goto err_release; 3415 } 3416 3417 /* validate the extent size is legal now we have the agf locked */ 3418 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { 3419 error = -EFSCORRUPTED; 3420 goto err_release; 3421 } 3422 3423 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type); 3424 if (error) 3425 goto err_release; 3426 3427 if (skip_discard) 3428 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; 3429 xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); 3430 xfs_perag_put(pag); 3431 return 0; 3432 3433 err_release: 3434 xfs_trans_brelse(tp, agbp); 3435 err: 3436 xfs_perag_put(pag); 3437 return error; 3438 } 3439 3440 struct xfs_alloc_query_range_info { 3441 xfs_alloc_query_range_fn fn; 3442 void *priv; 3443 }; 3444 3445 /* Format btree record and pass to our callback. */ 3446 STATIC int 3447 xfs_alloc_query_range_helper( 3448 struct xfs_btree_cur *cur, 3449 const union xfs_btree_rec *rec, 3450 void *priv) 3451 { 3452 struct xfs_alloc_query_range_info *query = priv; 3453 struct xfs_alloc_rec_incore irec; 3454 3455 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); 3456 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); 3457 return query->fn(cur, &irec, query->priv); 3458 } 3459 3460 /* Find all free space within a given range of blocks. */ 3461 int 3462 xfs_alloc_query_range( 3463 struct xfs_btree_cur *cur, 3464 const struct xfs_alloc_rec_incore *low_rec, 3465 const struct xfs_alloc_rec_incore *high_rec, 3466 xfs_alloc_query_range_fn fn, 3467 void *priv) 3468 { 3469 union xfs_btree_irec low_brec; 3470 union xfs_btree_irec high_brec; 3471 struct xfs_alloc_query_range_info query; 3472 3473 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3474 low_brec.a = *low_rec; 3475 high_brec.a = *high_rec; 3476 query.priv = priv; 3477 query.fn = fn; 3478 return xfs_btree_query_range(cur, &low_brec, &high_brec, 3479 xfs_alloc_query_range_helper, &query); 3480 } 3481 3482 /* Find all free space records. */ 3483 int 3484 xfs_alloc_query_all( 3485 struct xfs_btree_cur *cur, 3486 xfs_alloc_query_range_fn fn, 3487 void *priv) 3488 { 3489 struct xfs_alloc_query_range_info query; 3490 3491 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3492 query.priv = priv; 3493 query.fn = fn; 3494 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); 3495 } 3496 3497 /* Is there a record covering a given extent? */ 3498 int 3499 xfs_alloc_has_record( 3500 struct xfs_btree_cur *cur, 3501 xfs_agblock_t bno, 3502 xfs_extlen_t len, 3503 bool *exists) 3504 { 3505 union xfs_btree_irec low; 3506 union xfs_btree_irec high; 3507 3508 memset(&low, 0, sizeof(low)); 3509 low.a.ar_startblock = bno; 3510 memset(&high, 0xFF, sizeof(high)); 3511 high.a.ar_startblock = bno + len - 1; 3512 3513 return xfs_btree_has_record(cur, &low, &high, exists); 3514 } 3515 3516 /* 3517 * Walk all the blocks in the AGFL. The @walk_fn can return any negative 3518 * error code or XFS_ITER_*. 3519 */ 3520 int 3521 xfs_agfl_walk( 3522 struct xfs_mount *mp, 3523 struct xfs_agf *agf, 3524 struct xfs_buf *agflbp, 3525 xfs_agfl_walk_fn walk_fn, 3526 void *priv) 3527 { 3528 __be32 *agfl_bno; 3529 unsigned int i; 3530 int error; 3531 3532 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3533 i = be32_to_cpu(agf->agf_flfirst); 3534 3535 /* Nothing to walk in an empty AGFL. */ 3536 if (agf->agf_flcount == cpu_to_be32(0)) 3537 return 0; 3538 3539 /* Otherwise, walk from first to last, wrapping as needed. */ 3540 for (;;) { 3541 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); 3542 if (error) 3543 return error; 3544 if (i == be32_to_cpu(agf->agf_fllast)) 3545 break; 3546 if (++i == xfs_agfl_size(mp)) 3547 i = 0; 3548 } 3549 3550 return 0; 3551 } 3552 3553 int __init 3554 xfs_extfree_intent_init_cache(void) 3555 { 3556 xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent", 3557 sizeof(struct xfs_extent_free_item), 3558 0, 0, NULL); 3559 3560 return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM; 3561 } 3562 3563 void 3564 xfs_extfree_intent_destroy_cache(void) 3565 { 3566 kmem_cache_destroy(xfs_extfree_item_cache); 3567 xfs_extfree_item_cache = NULL; 3568 } 3569