// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"

struct kmem_cache	*xfs_extfree_item_cache;

struct workqueue_struct *xfs_alloc_wq;

#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))

#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2

/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_has_crc(mp))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}

unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

xfs_extlen_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_has_reflink(mp))
		return xfs_refc_block(mp) + 1;
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

/*
 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
 * guarantee that we can refill the AGFL prior to allocating space in a nearly
 * full AG.  Although the space described by the free space btrees, the
 * blocks used by the freesp btrees themselves, and the blocks owned by the
 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
 * free space in the AG drop so low that the free space btrees cannot refill an
 * empty AGFL up to the minimum level.  Rather than grind through empty AGs
 * until the fs goes down, we subtract this many AG blocks from the incore
 * fdblocks to ensure user allocation does not overcommit the space the
 * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
 * withhold space from xfs_mod_fdblocks, so we do not account for that here.
 */
#define XFS_ALLOCBT_AGFL_RESERVE	4

/*
 * Compute the number of blocks that we set aside to guarantee the ability to
 * refill the AGFL and handle a full bmap btree split.
 *
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks.  However, these restrictions may result in no actual space
 * allocated for a delayed extent, for example, a data block in a certain AG is
 * allocated but there is no additional block for the additional bmap btree
 * block due to a split of the bmap btree of the file.
 * The result of this may lead to an infinite loop when the file gets flushed
 * to disk and all delayed extents need to be actually allocated.  To get
 * around this, we explicitly set aside a few blocks which will not be
 * reserved in delayed allocation.
 *
 * For each AG, we need to reserve enough blocks to replenish a totally empty
 * AGFL and 4 more to handle a potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}

/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These
 * blocks are generally:
 *	- the AG superblock, AGF, AGI and AGFL
 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *	  the AGI free inode and rmap btree root blocks.
 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
 *	- the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOCBT_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_has_finobt(mp))
		blocks++;		/* finobt root block */
	if (xfs_has_rmapbt(mp))
		blocks++;		/* rmap root block */
	if (xfs_has_reflink(mp))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}

/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
STATIC int				/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_le(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
	cur->bc_ag.abt.active = (*stat == 1);
	return error;
}

static inline bool
xfs_alloc_cur_active(
	struct xfs_btree_cur	*cur)
{
	return cur && cur->bc_ag.abt.active;
}

/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_alloc_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len)	/* length of extent */
{
	union xfs_btree_rec	rec;

	rec.alloc.ar_startblock = cpu_to_be32(bno);
	rec.alloc.ar_blockcount = cpu_to_be32(len);
	return xfs_btree_update(cur, &rec);
}

/* Convert the ondisk btree record to its incore representation. */
void
xfs_alloc_btrec_to_irec(
	const union xfs_btree_rec	*rec,
	struct xfs_alloc_rec_incore	*irec)
{
	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
}

/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
	struct xfs_btree_cur		*cur,
	const struct xfs_alloc_rec_incore *irec)
{
	struct xfs_perag		*pag = cur->bc_ag.pag;

	if (irec->ar_blockcount == 0)
		return __this_address;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
		return __this_address;

	return NULL;
}

static inline int
xfs_alloc_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_alloc_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%s Freespace BTree record corruption in AG %d detected at %pS!",
		cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
		cur->bc_ag.pag->pag_agno, fa);
	xfs_warn(mp,
		"start block 0x%x block count 0x%x", irec->ar_startblock,
		irec->ar_blockcount);
	return -EFSCORRUPTED;
}

/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_alloc_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		*bno,	/* output: starting block of extent */
	xfs_extlen_t		*len,	/* output: length of extent */
	int			*stat)	/* output: success/failure */
{
	struct xfs_alloc_rec_incore irec;
	union xfs_btree_rec	*rec;
	xfs_failaddr_t		fa;
	int			error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || !(*stat))
		return error;

	xfs_alloc_btrec_to_irec(rec, &irec);
	fa = xfs_alloc_check_irec(cur, &irec);
	if (fa)
		return xfs_alloc_complain_bad_rec(cur, fa, &irec);

	*bno = irec.ar_startblock;
	*len = irec.ar_blockcount;
	return 0;
}

/*
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
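 * Busy extents are trimmed out of the found extent first; the return value
 * indicates whether any part of the found extent was busy.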
 */
STATIC bool
xfs_alloc_compute_aligned(
	xfs_alloc_arg_t	*args,		/* allocation argument structure */
	xfs_agblock_t	foundbno,	/* starting block in found extent */
	xfs_extlen_t	foundlen,	/* length in found extent */
	xfs_agblock_t	*resbno,	/* result block number */
	xfs_extlen_t	*reslen,	/* result length */
	unsigned	*busy_gen)
{
	xfs_agblock_t	bno = foundbno;
	xfs_extlen_t	len = foundlen;
	xfs_extlen_t	diff;
	bool		busy;

	/* Trim busy sections out of found extent */
	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);

	/*
	 * If we have a largish extent that happens to start before min_agbno,
	 * see if we can shift it into range...
	 */
	if (bno < args->min_agbno && bno + len > args->min_agbno) {
		diff = args->min_agbno - bno;
		if (len > diff) {
			bno += diff;
			len -= diff;
		}
	}

	if (args->alignment > 1 && len >= args->minlen) {
		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);

		diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}

	return busy;
}

/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t			/* difference value (absolute) */
xfs_alloc_compute_diff(
	xfs_agblock_t	wantbno,	/* target starting block */
	xfs_extlen_t	wantlen,	/* target length */
	xfs_extlen_t	alignment,	/* target alignment */
	int		datatype,	/* are we allocating data? */
	xfs_agblock_t	freebno,	/* freespace's starting block */
	xfs_extlen_t	freelen,	/* freespace's length */
	xfs_agblock_t	*newbnop)	/* result: best start block from free */
{
	xfs_agblock_t	freeend;	/* end of freespace extent */
	xfs_agblock_t	newbno1;	/* return block number */
	xfs_agblock_t	newbno2;	/* other new block number */
	xfs_extlen_t	newlen1 = 0;	/* length with newbno1 */
	xfs_extlen_t	newlen2 = 0;	/* length with newbno2 */
	xfs_agblock_t	wantend;	/* end of target extent */
	bool		userdata = datatype & XFS_ALLOC_USERDATA;

	ASSERT(freelen >= wantlen);
	freeend = freebno + freelen;
	wantend = wantbno + wantlen;
	/*
	 * We want to allocate from the start of a free extent if it is past
	 * the desired block or if we are allocating user data and the free
	 * extent is before desired block. The second case is there to allow
	 * for contiguous allocation from the remaining free space if the file
	 * grows in the short term.
	 */
	if (freebno >= wantbno || (userdata && freeend < wantend)) {
		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
			newbno1 = NULLAGBLOCK;
	} else if (freeend >= wantend && alignment > 1) {
		newbno1 = roundup(wantbno, alignment);
		newbno2 = newbno1 - alignment;
		if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
		else
			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
		if (newbno2 < freebno)
			newbno2 = NULLAGBLOCK;
		else
			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
			if (newlen1 < newlen2 ||
			    (newlen1 == newlen2 &&
			     XFS_ABSDIFF(newbno1, wantbno) >
			     XFS_ABSDIFF(newbno2, wantbno)))
				newbno1 = newbno2;
		} else if (newbno2 != NULLAGBLOCK)
			newbno1 = newbno2;
	} else if (freeend >= wantend) {
		newbno1 = wantbno;
	} else if (alignment > 1) {
		newbno1 = roundup(freeend - wantlen, alignment);
		if (newbno1 > freeend - wantlen &&
		    newbno1 - alignment >= freebno)
			newbno1 -= alignment;
		else if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
	} else
		newbno1 = freeend - wantlen;
	*newbnop = newbno1;
	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}

/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	xfs_extlen_t	k;
	xfs_extlen_t	rlen;

	ASSERT(args->mod < args->prod);
	rlen = args->len;
	ASSERT(rlen >= args->minlen);
	ASSERT(rlen <= args->maxlen);
	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
	    (args->mod == 0 && rlen < args->prod))
		return;
	k = rlen % args->prod;
	if (k == args->mod)
		return;
	if (k > args->mod)
		rlen = rlen - (k - args->mod);
	else
		rlen = rlen - args->prod + (args->mod - k);
	/* casts to (int) catch length underflows */
	if ((int)rlen < (int)args->minlen)
		return;
	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
	ASSERT(rlen % args->prod == args->mod);
	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
		rlen + args->minleft);
	args->len = rlen;
}

/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int				/* error code */
xfs_alloc_fixup_trees(
	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
	xfs_agblock_t	fbno,		/* starting block of free extent */
	xfs_extlen_t	flen,		/* length of free extent */
	xfs_agblock_t	rbno,		/* starting block of returned extent */
	xfs_extlen_t	rlen,		/* length of returned extent */
	int		flags)		/* flags, XFSA_FIXUP_... */
{
	int		error;		/* error code */
	int		i;		/* operation results */
	xfs_agblock_t	nfbno1;		/* first new free startblock */
	xfs_agblock_t	nfbno2;		/* second new free startblock */
	xfs_extlen_t	nflen1 = 0;	/* first new free length */
	xfs_extlen_t	nflen2 = 0;	/* second new free length */
	struct xfs_mount *mp;

	mp = cnt_cur->bc_mp;

	/*
	 * Look up the record in the by-size tree if necessary.
	 */
	if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Look up the record in the by-block tree if necessary.
	 */
	if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen))
			return -EFSCORRUPTED;
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}

#ifdef DEBUG
	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
		struct xfs_btree_block	*bnoblock;
		struct xfs_btree_block	*cntblock;

		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);

		if (XFS_IS_CORRUPT(mp,
				   bnoblock->bb_numrecs !=
				   cntblock->bb_numrecs))
			return -EFSCORRUPTED;
	}
#endif

	/*
	 * Deal with all four cases: the allocated record is contained
	 * within the freespace record, so we can have new freespace
	 * at either (or both) end, or no freespace remaining.
	 */
	if (rbno == fbno && rlen == flen)
		nfbno1 = nfbno2 = NULLAGBLOCK;
	else if (rbno == fbno) {
		nfbno1 = rbno + rlen;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else if (rbno + rlen == fbno + flen) {
		nfbno1 = fbno;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else {
		nfbno1 = fbno;
		nflen1 = rbno - fbno;
		nfbno2 = rbno + rlen;
		nflen2 = (fbno + flen) - nfbno2;
	}
	/*
	 * Delete the entry from the by-size btree.
	 */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		return error;
	if (XFS_IS_CORRUPT(mp, i != 1))
		return -EFSCORRUPTED;
	/*
	 * Add new by-size btree entry(s).
	 */
	if (nfbno1 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	if (nfbno2 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	/*
	 * Fix up the by-block btree entry(s).
	 */
	if (nfbno1 == NULLAGBLOCK) {
		/*
		 * No remaining freespace, just delete the by-block tree entry.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0))
			return -EFSCORRUPTED;
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1))
			return -EFSCORRUPTED;
	}
	return 0;
}

static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int		i;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_has_crc(mp))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}

static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of the
	 * AGFL is what the AGF says is active. We can't get to the AGF, so we
	 * can't verify just those entries are valid.
	 */
	if (!xfs_has_crc(mp))
		return;

	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agfl_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}

static void
xfs_agfl_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_buf_log_item *bip = bp->b_log_item;
	xfs_failaddr_t	fa;

	/* no verification of non-crc AGFLs */
	if (!xfs_has_crc(mp))
		return;

	fa = xfs_agfl_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (bip)
		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}

const struct xfs_buf_ops xfs_agfl_buf_ops = {
	.name = "xfs_agfl",
	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
	.verify_read = xfs_agfl_read_verify,
	.verify_write = xfs_agfl_write_verify,
	.verify_struct = xfs_agfl_verify,
};

/*
 * Read in the allocation group free block array.
 */
int
xfs_alloc_read_agfl(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_trans_read_buf(
			mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_AGFL_REF);
	*bpp = bp;
	return 0;
}

STATIC int
xfs_alloc_update_counters(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	long			len)
{
	struct xfs_agf		*agf = agbp->b_addr;

	agbp->b_pag->pagf_freeblks += len;
	be32_add_cpu(&agf->agf_freeblks, len);

	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
		     be32_to_cpu(agf->agf_length))) {
		xfs_buf_mark_corrupt(agbp);
		return -EFSCORRUPTED;
	}

	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
	return 0;
}

/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
	struct xfs_btree_cur	*cnt;	/* btree cursors */
	struct xfs_btree_cur	*bnolt;
	struct xfs_btree_cur	*bnogt;
	xfs_extlen_t		cur_len;/* current search length */
	xfs_agblock_t		rec_bno;/* extent startblock */
	xfs_extlen_t		rec_len;/* extent length */
	xfs_agblock_t		bno;	/* alloc bno */
	xfs_extlen_t		len;	/* alloc len */
	xfs_extlen_t		diff;	/* diff from search bno */
	unsigned int		busy_gen;/* busy state */
	bool			busy;
};

/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;
	int			i;

	acur->cur_len = args->maxlen;
	acur->rec_bno = 0;
	acur->rec_len = 0;
	acur->bno = 0;
	acur->len = 0;
	acur->diff = -1;
	acur->busy = false;
	acur->busy_gen = 0;

	/*
	 * Perform an initial cntbt lookup to check for availability of maxlen
	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
	 * attempt a small allocation.
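	 * Even then we still set up the bnobt cursors below, so a caller that
	 * falls back to a small allocation can reuse the cntbt cursor without
	 * tearing the structure down.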
	 */
	if (!acur->cnt)
		acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_CNT);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_BNO);
	if (!acur->bnogt)
		acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag, XFS_BTNUM_BNO);
	return i == 1 ? 0 : -ENOSPC;
}

static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}

/*
 * Check an extent for allocation and track the best available candidate in the
 * allocation structure. The cursor is deactivated if it has entered an out of
 * range state based on allocation arguments. Optionally return the extent
 * geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1))
		return -EFSCORRUPTED;

	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation mode.
	 */
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
	 */
	if (diff > acur->diff) {
		deactivate = isbnobt;
		goto out;
	}

	ASSERT(args->len > acur->len ||
	       (args->len == acur->len && diff <= acur->diff));
	acur->rec_bno = bno;
	acur->rec_len = len;
	acur->bno = bnew;
	acur->len = args->len;
	acur->diff = diff;
	*new = 1;

	/*
	 * We're done if we found a perfect allocation.
	 * This only deactivates the current cursor, but this is just an
	 * optimization to terminate a cntbt search that otherwise runs to the
	 * edge of the tree.
	 */
	if (acur->diff == 0 && acur->len == args->maxlen)
		deactivate = true;
out:
	if (deactivate)
		cur->bc_ag.abt.active = false;
	trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
				  *new);
	return 0;
}

/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
STATIC int
xfs_alloc_cur_finish(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	int			error;

	ASSERT(acur->cnt && acur->bnolt);
	ASSERT(acur->bno >= acur->rec_bno);
	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
	ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));

	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
				      acur->rec_len, acur->bno, acur->len, 0);
	if (error)
		return error;

	args->agbno = acur->bno;
	args->len = acur->len;
	args->wasfromfl = 0;

	trace_xfs_alloc_cur(args);
	return 0;
}

/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
	struct xfs_alloc_arg		*args,
	struct xfs_alloc_cur		*acur)
{
	struct xfs_btree_cur	*cur = acur->cnt;
	xfs_agblock_t		bno;
	xfs_extlen_t		len, cur_len;
	int			error;
	int			i;

	if (!xfs_alloc_cur_active(cur))
		return 0;

	/* locality optimized lookup */
	cur_len = acur->cur_len;
	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;
	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;

	/* check the current record and update search length from it */
	error = xfs_alloc_cur_check(args, acur, cur, &i);
	if (error)
		return error;
	ASSERT(len >= acur->cur_len);
	acur->cur_len = len;

	/*
	 * We looked up the first record >= [agbno, len] above. The agbno is a
	 * secondary key and so the current record may lie just before or after
	 * agbno. If it is past agbno, check the previous record too so long as
	 * the length matches as it may be closer. Don't check a smaller record
	 * because that could deactivate our cursor.
	 */
	if (bno > args->agbno) {
		error = xfs_btree_decrement(cur, 0, &i);
		if (!error && i) {
			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
			if (!error && i && len == acur->cur_len)
				error = xfs_alloc_cur_check(args, acur, cur,
							    &i);
		}
		if (error)
			return error;
	}

	/*
	 * Increment the search key until we find at least one allocation
	 * candidate or if the extent we found was larger. Otherwise, double the
	 * search key to optimize the search. Efficiency is more important here
	 * than absolute best locality.
	 */
	cur_len <<= 1;
	if (!acur->len || acur->cur_len >= cur_len)
		acur->cur_len++;
	else
		acur->cur_len = cur_len;

	return error;
}

/*
 * Deal with the case where only small freespaces remain.
 * Either return the contents of the last freespace record, or allocate space
 * from the freelist if there is nothing in the tree.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_small(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
	xfs_agblock_t		*fbnop,	/* result block number */
	xfs_extlen_t		*flenp,	/* result length */
	int			*stat)	/* status: 0-freelist, 1-normal/none */
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	int			error = 0;
	xfs_agblock_t		fbno = NULLAGBLOCK;
	xfs_extlen_t		flen = 0;
	int			i = 0;

	/*
	 * If a cntbt cursor is provided, try to allocate the largest record in
	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
	 * allocation. Make sure to respect minleft even when pulling from the
	 * freelist.
	 */
	if (ccur)
		error = xfs_btree_decrement(ccur, 0, &i);
	if (error)
		goto error;
	if (i) {
		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error;
		}
		goto out;
	}

	if (args->minlen != 1 || args->alignment != 1 ||
	    args->resv == XFS_AG_RESV_AGFL ||
	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
		goto out;

	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
			&fbno, 0);
	if (error)
		goto error;
	if (fbno == NULLAGBLOCK)
		goto out;

	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
			      (args->datatype & XFS_ALLOC_NOBUSY));

	if (args->datatype & XFS_ALLOC_USERDATA) {
		struct xfs_buf	*bp;

		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
				args->mp->m_bsize, 0, &bp);
		if (error)
			goto error;
		xfs_trans_binval(args->tp, bp);
	}
	*fbnop = args->agbno = fbno;
	*flenp = args->len = 1;
	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error;
	}
	args->wasfromfl = 1;
	trace_xfs_alloc_small_freelist(args);

	/*
	 * If we're feeding an AGFL block to something that doesn't live in the
	 * free space, we need to clear out the OWN_AG rmap.
	 */
	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
			      &XFS_RMAP_OINFO_AG);
	if (error)
		goto error;

	*stat = 0;
	return 0;

out:
	/*
	 * Can't do the allocation, give up.
	 */
	if (flen < args->minlen) {
		args->agbno = NULLAGBLOCK;
		trace_xfs_alloc_small_notenough(args);
		flen = 0;
	}
	*fbnop = fbno;
	*flenp = flen;
	*stat = 1;
	trace_xfs_alloc_small_done(args);
	return 0;

error:
	trace_xfs_alloc_small_error(args);
	return error;
}

/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
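 *
 * The caller is expected to pass args->alignment == 1 here; exact allocations
 * are never realigned.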
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_exact(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
	struct xfs_btree_cur *bno_cur;	/* by block-number btree cursor */
	struct xfs_btree_cur *cnt_cur;	/* by count btree cursor */
	int		error;
	xfs_agblock_t	fbno;	/* start block of found extent */
	xfs_extlen_t	flen;	/* length of found extent */
	xfs_agblock_t	tbno;	/* start block of busy extent */
	xfs_extlen_t	tlen;	/* length of busy extent */
	xfs_agblock_t	tend;	/* end block of busy extent */
	int		i;	/* success/failure of operation */
	unsigned	busy_gen;

	ASSERT(args->alignment == 1);

	/*
	 * Allocate/initialize a cursor for the by-number freespace btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_BNO);

	/*
	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
	 * Look for the closest free block <= bno, it must contain bno
	 * if any free block does.
	 */
	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
	if (error)
		goto error0;
	if (!i)
		goto not_found;

	/*
	 * Grab the freespace record.
	 */
	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(fbno <= args->agbno);

	/*
	 * Check for overlapping busy extents.
	 */
	tbno = fbno;
	tlen = flen;
	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

	/*
	 * Give up if the start of the extent is busy, or the freespace isn't
	 * long enough for the minimum request.
	 */
	if (tbno > args->agbno)
		goto not_found;
	if (tlen < args->minlen)
		goto not_found;
	tend = tbno + tlen;
	if (tend < args->agbno + args->minlen)
		goto not_found;

	/*
	 * End of extent will be smaller of the freespace end and the
	 * maximal requested end.
	 *
	 * Fix the length according to mod and prod if given.
	 */
	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
						- args->agbno;
	xfs_alloc_fix_len(args);
	ASSERT(args->agbno + args->len <= tend);

	/*
	 * We are allocating agbno for args->len
	 * Allocate/initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_CNT);
	ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
				      args->len, XFSA_FIXUP_BNO_OK);
	if (error) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
		goto error0;
	}

	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

	args->wasfromfl = 0;
	trace_xfs_alloc_exact_done(args);
	return 0;

not_found:
	/* Didn't find it, return null. */
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	args->agbno = NULLAGBLOCK;
	trace_xfs_alloc_exact_notfound(args);
	return 0;

error0:
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	trace_xfs_alloc_exact_error(args);
	return error;
}

/*
 * Search a given number of btree records in a given direction. Check each
 * record against the good extent we've already found.
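 * The walk stops once count records have been checked (when count is
 * positive), when the cursor is deactivated, or, if find_one is set, as soon
 * as the first candidate is found.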
 */
STATIC int
xfs_alloc_walk_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	bool			increment,
	bool			find_one, /* quit on first candidate */
	int			count,    /* rec count (-1 for infinite) */
	int			*stat)
{
	int			error;
	int			i;

	*stat = 0;

	/*
	 * Search so long as the cursor is active or we find a better extent.
	 * The cursor is deactivated if it extends beyond the range of the
	 * current allocation candidate.
	 */
	while (xfs_alloc_cur_active(cur) && count) {
		error = xfs_alloc_cur_check(args, acur, cur, &i);
		if (error)
			return error;
		if (i == 1) {
			*stat = 1;
			if (find_one)
				break;
		}
		if (!xfs_alloc_cur_active(cur))
			break;

		if (increment)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			return error;
		if (i == 0)
			cur->bc_ag.abt.active = false;

		if (count > 0)
			count--;
	}

	return 0;
}

/*
 * Search the by-bno and by-size btrees in parallel in search of an extent with
 * ideal locality based on the NEAR mode ->agbno locality hint.
 */
STATIC int
xfs_alloc_ag_vextent_locality(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	int			*stat)
{
	struct xfs_btree_cur	*fbcur = NULL;
	int			error;
	int			i;
	bool			fbinc;

	ASSERT(acur->len == 0);

	*stat = 0;

	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
	if (error)
		return error;

	/*
	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
	 * right and lookup the closest extent to the locality hint for each
	 * extent size key in the cntbt. The entire search terminates
	 * immediately on a bnobt hit because that means we've found best case
	 * locality. Otherwise the search continues until the cntbt cursor runs
	 * off the end of the tree. If no allocation candidate is found at this
	 * point, give up on locality, walk backwards from the end of the cntbt
	 * and take the first available extent.
	 *
	 * The parallel tree searches balance each other out to provide fairly
	 * consistent performance for various situations. The bnobt search can
	 * have pathological behavior in the worst case scenario of larger
	 * allocation requests and fragmented free space. On the other hand, the
	 * bnobt is able to satisfy most smaller allocation requests much more
	 * quickly than the cntbt. The cntbt search can sift through fragmented
	 * free space and sets of free extents for larger allocation requests
	 * more quickly than the bnobt. Since the locality hint is just a hint
	 * and we don't want to scan the entire bnobt for perfect locality, the
	 * cntbt search essentially bounds the bnobt search such that we can
	 * find good enough locality at reasonable performance in most cases.
	 */
	while (xfs_alloc_cur_active(acur->bnolt) ||
	       xfs_alloc_cur_active(acur->bnogt) ||
	       xfs_alloc_cur_active(acur->cnt)) {

		trace_xfs_alloc_cur_lookup(args);

		/*
		 * Search the bnobt left and right. In the case of a hit, finish
		 * the search in the opposite direction and we're done.
		 */
		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
					    true, 1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_left(args);
			fbcur = acur->bnogt;
			fbinc = true;
			break;
		}
		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
					    1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_right(args);
			fbcur = acur->bnolt;
			fbinc = false;
			break;
		}

		/*
		 * Check the extent with best locality based on the current
		 * extent size search key and keep track of the best candidate.
		 */
		error = xfs_alloc_cntbt_iter(args, acur);
		if (error)
			return error;
		if (!xfs_alloc_cur_active(acur->cnt)) {
			trace_xfs_alloc_cur_lookup_done(args);
			break;
		}
	}

	/*
	 * If we failed to find anything due to busy extents, return empty
	 * handed so the caller can flush and retry. If no busy extents were
	 * found, walk backwards from the end of the cntbt as a last resort.
	 */
	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
		error = xfs_btree_decrement(acur->cnt, 0, &i);
		if (error)
			return error;
		if (i) {
			acur->cnt->bc_ag.abt.active = true;
			fbcur = acur->cnt;
			fbinc = false;
		}
	}

	/*
	 * Search in the opposite direction for a better entry in the case of
	 * a bnobt hit or walk backwards from the end of the cntbt.
	 */
	if (fbcur) {
		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
					    &i);
		if (error)
			return error;
	}

	if (acur->len)
		*stat = 1;

	return 0;
}

/* Check the last block of the cnt btree for allocations. */
static int
xfs_alloc_ag_vextent_lastblock(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	bool			*allocated)
{
	int			error;
	int			i;

#ifdef DEBUG
	/* Randomly don't execute the first algorithm. */
	if (get_random_u32_below(2))
		return 0;
#endif

	/*
	 * Start from the entry that lookup found, sequence through all larger
	 * free blocks. If we're actually pointing at a record smaller than
	 * maxlen, go to the start of this block, and skip all those smaller
	 * than minlen.
	 */
	if (*len || args->alignment > 1) {
		acur->cnt->bc_levels[0].ptr = 1;
		do {
			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(args->mp, i != 1))
				return -EFSCORRUPTED;
			if (*len >= args->minlen)
				break;
			error = xfs_btree_increment(acur->cnt, 0, &i);
			if (error)
				return error;
		} while (i);
		ASSERT(*len >= args->minlen);
		if (!i)
			return 0;
	}

	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
	if (error)
		return error;

	/*
	 * It didn't work. We COULD be in a case where there's a good record
	 * somewhere, so try again.
	 */
	if (acur->len == 0)
		return 0;

	trace_xfs_alloc_near_first(args);
	*allocated = true;
	return 0;
}

/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_near(
	struct xfs_alloc_arg	*args)
{
	struct xfs_alloc_cur	acur = {};
	int			error;		/* error code */
	int			i;		/* result code, temporary */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;

	/* handle uninitialized agbno range so caller doesn't have to */
	if (!args->min_agbno && !args->max_agbno)
		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
	ASSERT(args->min_agbno <= args->max_agbno);

	/* clamp agbno to the range if it's outside */
	if (args->agbno < args->min_agbno)
		args->agbno = args->min_agbno;
	if (args->agbno > args->max_agbno)
		args->agbno = args->max_agbno;

restart:
	len = 0;

	/*
	 * Set up cursors and see if there are any free extents as big as
	 * maxlen. If not, pick the last entry in the tree unless the tree is
	 * empty.
	 */
	error = xfs_alloc_cur_setup(args, &acur);
	if (error == -ENOSPC) {
		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
						   &len, &i);
		if (error)
			goto out;
		if (i == 0 || len == 0) {
			trace_xfs_alloc_near_noentry(args);
			goto out;
		}
		ASSERT(i == 1);
	} else if (error) {
		goto out;
	}

	/*
	 * First algorithm.
	 * If the requested extent is large wrt the freespaces available
	 * in this a.g., then the cursor will be pointing to a btree entry
	 * near the right edge of the tree. If it's in the last btree leaf
	 * block, then we just examine all the entries in that block
	 * that are big enough, and pick the best one.
	 */
	if (xfs_btree_islastblock(acur.cnt, 0)) {
		bool		allocated = false;

		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
						       &allocated);
		if (error)
			goto out;
		if (allocated)
			goto alloc_finish;
	}

	/*
	 * Second algorithm. Combined cntbt and bnobt search to find ideal
	 * locality.
	 */
	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
	if (error)
		goto out;

	/*
	 * If we couldn't get anything, give up.
	 */
	if (!acur.len) {
		if (acur.busy) {
			trace_xfs_alloc_near_busy(args);
			xfs_extent_busy_flush(args->mp, args->pag,
					      acur.busy_gen);
			goto restart;
		}
		trace_xfs_alloc_size_neither(args);
		args->agbno = NULLAGBLOCK;
		goto out;
	}

alloc_finish:
	/* fix up btrees on a successful allocation */
	error = xfs_alloc_cur_finish(args, &acur);

out:
	xfs_alloc_cur_close(&acur, error);
	return error;
}

/*
 * Allocate a variable extent anywhere in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
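 *
 * Unlike the "near" allocator above, the search here walks only the by-size
 * (cnt) btree; the by-block btree is touched just to fix up the extent that
 * was chosen.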
 */
STATIC int				/* error */
xfs_alloc_ag_vextent_size(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	struct xfs_agf	*agf = args->agbp->b_addr;
	struct xfs_btree_cur *bno_cur;	/* cursor for bno btree */
	struct xfs_btree_cur *cnt_cur;	/* cursor for cnt btree */
	int		error;		/* error result */
	xfs_agblock_t	fbno;		/* start of found freespace */
	xfs_extlen_t	flen;		/* length of found freespace */
	int		i;		/* temp status variable */
	xfs_agblock_t	rbno;		/* returned block number */
	xfs_extlen_t	rlen;		/* length of returned extent */
	bool		busy;
	unsigned	busy_gen;

restart:
	/*
	 * Allocate and initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_CNT);
	bno_cur = NULL;

	/*
	 * Look for an entry >= maxlen+alignment-1 blocks.
	 */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
			args->maxlen + args->alignment - 1, &i)))
		goto error0;

	/*
	 * If none then we have to settle for a smaller extent. In the case that
	 * there are no large extents, this will return the last entry in the
	 * tree unless the tree is empty. In the case that there are only busy
	 * large extents, this will return the largest small extent unless there
	 * are no smaller extents available.
	 */
	if (!i) {
		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
						   &fbno, &flen, &i);
		if (error)
			goto error0;
		if (i == 0 || flen == 0) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_noentry(args);
			return 0;
		}
		ASSERT(i == 1);
		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
				&rlen, &busy_gen);
	} else {
		/*
		 * Search for a non-busy extent that is large enough.
		 */
		for (;;) {
			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}

			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);

			if (rlen >= args->maxlen)
				break;

			error = xfs_btree_increment(cnt_cur, 0, &i);
			if (error)
				goto error0;
			if (i == 0) {
				/*
				 * Our only valid extents must have been busy.
				 * Make it unbusy by forcing the log out and
				 * retrying.
				 */
				xfs_btree_del_cursor(cnt_cur,
						     XFS_BTREE_NOERROR);
				trace_xfs_alloc_size_busy(args);
				xfs_extent_busy_flush(args->mp,
							args->pag, busy_gen);
				goto restart;
			}
		}
	}

	/*
	 * In the first case above, we got the last entry in the
	 * by-size btree. Now we check to see if the space hits maxlen
	 * once aligned; if not, we search left for something better.
	 * This can't happen in the second case above.
	 */
	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
	if (XFS_IS_CORRUPT(args->mp,
			   rlen != 0 &&
			   (rlen > flen ||
			    rbno + rlen > fbno + flen))) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (rlen < args->maxlen) {
		xfs_agblock_t	bestfbno;
		xfs_extlen_t	bestflen;
		xfs_agblock_t	bestrbno;
		xfs_extlen_t	bestrlen;

		bestrlen = rlen;
		bestrbno = rbno;
		bestflen = flen;
		bestfbno = fbno;
		for (;;) {
			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
				goto error0;
			if (i == 0)
				break;
			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (flen < bestrlen)
				break;
			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);
			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
			if (XFS_IS_CORRUPT(args->mp,
					   rlen != 0 &&
					   (rlen > flen ||
					    rbno + rlen > fbno + flen))) {
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (rlen > bestrlen) {
				bestrlen = rlen;
				bestrbno = rbno;
				bestflen = flen;
				bestfbno = fbno;
				if (rlen == args->maxlen)
					break;
			}
		}
		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
				&i)))
			goto error0;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		rlen = bestrlen;
		rbno = bestrbno;
		flen = bestflen;
		fbno = bestfbno;
	}
	args->wasfromfl = 0;
	/*
	 * Fix up the length.
	 */
	args->len = rlen;
	if (rlen < args->minlen) {
		if (busy) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_busy(args);
			xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
			goto restart;
		}
		goto out_nominleft;
	}
	xfs_alloc_fix_len(args);

	rlen = args->len;
	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Allocate and initialize a cursor for the by-block tree.
	 */
	bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
					  args->pag, XFS_BTNUM_BNO);
	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
			rbno, rlen, XFSA_FIXUP_CNT_OK)))
		goto error0;
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	cnt_cur = bno_cur = NULL;
	args->len = rlen;
	args->agbno = rbno;
	if (XFS_IS_CORRUPT(args->mp,
			   args->agbno + args->len >
			   be32_to_cpu(agf->agf_length))) {
		error = -EFSCORRUPTED;
		goto error0;
	}
	trace_xfs_alloc_size_done(args);
	return 0;

error0:
	trace_xfs_alloc_size_error(args);
	if (cnt_cur)
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
	if (bno_cur)
		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	return error;

out_nominleft:
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	trace_xfs_alloc_size_nominleft(args);
	args->agbno = NULLAGBLOCK;
	return 0;
}

/*
 * Free the extent starting at agno/bno for length.
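 * The freed range is merged with any contiguous free space on either side,
 * and both the by-block and by-size btrees are updated to match.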
 */
STATIC int
xfs_free_ag_extent(
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agnumber_t			agno,
	xfs_agblock_t			bno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xfs_mount		*mp;
	struct xfs_btree_cur		*bno_cur;
	struct xfs_btree_cur		*cnt_cur;
	xfs_agblock_t			gtbno; /* start of right neighbor */
	xfs_extlen_t			gtlen; /* length of right neighbor */
	xfs_agblock_t			ltbno; /* start of left neighbor */
	xfs_extlen_t			ltlen; /* length of left neighbor */
	xfs_agblock_t			nbno; /* new starting block of freesp */
	xfs_extlen_t			nlen; /* new length of freespace */
	int				haveleft; /* have a left neighbor */
	int				haveright; /* have a right neighbor */
	int				i;
	int				error;
	struct xfs_perag		*pag = agbp->b_pag;

	bno_cur = cnt_cur = NULL;
	mp = tp->t_mountp;

	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
		if (error)
			goto error0;
	}

	/*
	 * Allocate and initialize a cursor for the by-block btree.
	 */
	bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
	/*
	 * Look for a neighboring block on the left (lower block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
		goto error0;
	if (haveleft) {
		/*
		 * There is a block to our left.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (ltbno + ltlen < bno)
			haveleft = 0;
		else {
			/*
			 * If this failure happens the request to free this
			 * space was invalid, it's (partly) already free.
			 * Very bad.
			 */
			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}
	/*
	 * Look for a neighboring block on the right (higher block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
		goto error0;
	if (haveright) {
		/*
		 * There is a block to our right.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (bno + len < gtbno)
			haveright = 0;
		else {
			/*
			 * If this failure happens the request to free this
			 * space was invalid, it's (partly) already free.
			 * Very bad.
			 */
			if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}
	/*
	 * Now allocate and initialize a cursor for the by-size tree.
	 */
	cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
	/*
	 * Have both left and right contiguous neighbors.
	 * Merge all three into a single free block.
	 */
	if (haveleft && haveright) {
		/*
		 * Delete the old by-size entry on the left.
1959 */ 1960 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 1961 goto error0; 1962 if (XFS_IS_CORRUPT(mp, i != 1)) { 1963 error = -EFSCORRUPTED; 1964 goto error0; 1965 } 1966 if ((error = xfs_btree_delete(cnt_cur, &i))) 1967 goto error0; 1968 if (XFS_IS_CORRUPT(mp, i != 1)) { 1969 error = -EFSCORRUPTED; 1970 goto error0; 1971 } 1972 /* 1973 * Delete the old by-size entry on the right. 1974 */ 1975 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 1976 goto error0; 1977 if (XFS_IS_CORRUPT(mp, i != 1)) { 1978 error = -EFSCORRUPTED; 1979 goto error0; 1980 } 1981 if ((error = xfs_btree_delete(cnt_cur, &i))) 1982 goto error0; 1983 if (XFS_IS_CORRUPT(mp, i != 1)) { 1984 error = -EFSCORRUPTED; 1985 goto error0; 1986 } 1987 /* 1988 * Delete the old by-block entry for the right block. 1989 */ 1990 if ((error = xfs_btree_delete(bno_cur, &i))) 1991 goto error0; 1992 if (XFS_IS_CORRUPT(mp, i != 1)) { 1993 error = -EFSCORRUPTED; 1994 goto error0; 1995 } 1996 /* 1997 * Move the by-block cursor back to the left neighbor. 1998 */ 1999 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2000 goto error0; 2001 if (XFS_IS_CORRUPT(mp, i != 1)) { 2002 error = -EFSCORRUPTED; 2003 goto error0; 2004 } 2005 #ifdef DEBUG 2006 /* 2007 * Check that this is the right record: delete didn't 2008 * mangle the cursor. 2009 */ 2010 { 2011 xfs_agblock_t xxbno; 2012 xfs_extlen_t xxlen; 2013 2014 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 2015 &i))) 2016 goto error0; 2017 if (XFS_IS_CORRUPT(mp, 2018 i != 1 || 2019 xxbno != ltbno || 2020 xxlen != ltlen)) { 2021 error = -EFSCORRUPTED; 2022 goto error0; 2023 } 2024 } 2025 #endif 2026 /* 2027 * Update remaining by-block entry to the new, joined block. 2028 */ 2029 nbno = ltbno; 2030 nlen = len + ltlen + gtlen; 2031 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2032 goto error0; 2033 } 2034 /* 2035 * Have only a left contiguous neighbor. 2036 * Merge it together with the new freespace. 2037 */ 2038 else if (haveleft) { 2039 /* 2040 * Delete the old by-size entry on the left. 2041 */ 2042 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2043 goto error0; 2044 if (XFS_IS_CORRUPT(mp, i != 1)) { 2045 error = -EFSCORRUPTED; 2046 goto error0; 2047 } 2048 if ((error = xfs_btree_delete(cnt_cur, &i))) 2049 goto error0; 2050 if (XFS_IS_CORRUPT(mp, i != 1)) { 2051 error = -EFSCORRUPTED; 2052 goto error0; 2053 } 2054 /* 2055 * Back up the by-block cursor to the left neighbor, and 2056 * update its length. 2057 */ 2058 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2059 goto error0; 2060 if (XFS_IS_CORRUPT(mp, i != 1)) { 2061 error = -EFSCORRUPTED; 2062 goto error0; 2063 } 2064 nbno = ltbno; 2065 nlen = len + ltlen; 2066 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2067 goto error0; 2068 } 2069 /* 2070 * Have only a right contiguous neighbor. 2071 * Merge it together with the new freespace. 2072 */ 2073 else if (haveright) { 2074 /* 2075 * Delete the old by-size entry on the right. 2076 */ 2077 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2078 goto error0; 2079 if (XFS_IS_CORRUPT(mp, i != 1)) { 2080 error = -EFSCORRUPTED; 2081 goto error0; 2082 } 2083 if ((error = xfs_btree_delete(cnt_cur, &i))) 2084 goto error0; 2085 if (XFS_IS_CORRUPT(mp, i != 1)) { 2086 error = -EFSCORRUPTED; 2087 goto error0; 2088 } 2089 /* 2090 * Update the starting block and length of the right 2091 * neighbor in the by-block tree. 
2092 */ 2093 nbno = bno; 2094 nlen = len + gtlen; 2095 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2096 goto error0; 2097 } 2098 /* 2099 * No contiguous neighbors. 2100 * Insert the new freespace into the by-block tree. 2101 */ 2102 else { 2103 nbno = bno; 2104 nlen = len; 2105 if ((error = xfs_btree_insert(bno_cur, &i))) 2106 goto error0; 2107 if (XFS_IS_CORRUPT(mp, i != 1)) { 2108 error = -EFSCORRUPTED; 2109 goto error0; 2110 } 2111 } 2112 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2113 bno_cur = NULL; 2114 /* 2115 * In all cases we need to insert the new freespace in the by-size tree. 2116 */ 2117 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2118 goto error0; 2119 if (XFS_IS_CORRUPT(mp, i != 0)) { 2120 error = -EFSCORRUPTED; 2121 goto error0; 2122 } 2123 if ((error = xfs_btree_insert(cnt_cur, &i))) 2124 goto error0; 2125 if (XFS_IS_CORRUPT(mp, i != 1)) { 2126 error = -EFSCORRUPTED; 2127 goto error0; 2128 } 2129 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2130 cnt_cur = NULL; 2131 2132 /* 2133 * Update the freespace totals in the ag and superblock. 2134 */ 2135 error = xfs_alloc_update_counters(tp, agbp, len); 2136 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); 2137 if (error) 2138 goto error0; 2139 2140 XFS_STATS_INC(mp, xs_freex); 2141 XFS_STATS_ADD(mp, xs_freeb, len); 2142 2143 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); 2144 2145 return 0; 2146 2147 error0: 2148 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); 2149 if (bno_cur) 2150 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2151 if (cnt_cur) 2152 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2153 return error; 2154 } 2155 2156 /* 2157 * Visible (exported) allocation/free functions. 2158 * Some of these are used just by xfs_alloc_btree.c and this file. 2159 */ 2160 2161 /* 2162 * Compute and fill in value of m_alloc_maxlevels. 2163 */ 2164 void 2165 xfs_alloc_compute_maxlevels( 2166 xfs_mount_t *mp) /* file system mount structure */ 2167 { 2168 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2169 (mp->m_sb.sb_agblocks + 1) / 2); 2170 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); 2171 } 2172 2173 /* 2174 * Find the length of the longest extent in an AG. The 'need' parameter 2175 * specifies how much space we're going to need for the AGFL and the 2176 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2177 * other callers. 2178 */ 2179 xfs_extlen_t 2180 xfs_alloc_longest_free_extent( 2181 struct xfs_perag *pag, 2182 xfs_extlen_t need, 2183 xfs_extlen_t reserved) 2184 { 2185 xfs_extlen_t delta = 0; 2186 2187 /* 2188 * If the AGFL needs a recharge, we'll have to subtract that from the 2189 * longest extent. 2190 */ 2191 if (need > pag->pagf_flcount) 2192 delta = need - pag->pagf_flcount; 2193 2194 /* 2195 * If we cannot maintain others' reservations with space from the 2196 * not-longest freesp extents, we'll have to subtract /that/ from 2197 * the longest extent too. 2198 */ 2199 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2200 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2201 2202 /* 2203 * If the longest extent is long enough to satisfy all the 2204 * reservations and AGFL rules in place, we can return this extent. 2205 */ 2206 if (pag->pagf_longest > delta) 2207 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, 2208 pag->pagf_longest - delta); 2209 2210 /* Otherwise, let the caller try for 1 block if there's space. 
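 *
 * Note that the expression below deliberately collapses to 0 or 1: if either
 * the AGFL or the by-size btree still holds any blocks at all, report a
 * longest free extent of a single block so the caller can attempt a minimal
 * allocation; otherwise report that nothing is available.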
*/ 2211 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2212 } 2213 2214 /* 2215 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2216 * return the largest possible minimum length. 2217 */ 2218 unsigned int 2219 xfs_alloc_min_freelist( 2220 struct xfs_mount *mp, 2221 struct xfs_perag *pag) 2222 { 2223 /* AG btrees have at least 1 level. */ 2224 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1}; 2225 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels; 2226 unsigned int min_free; 2227 2228 ASSERT(mp->m_alloc_maxlevels > 0); 2229 2230 /* space needed by-bno freespace btree */ 2231 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, 2232 mp->m_alloc_maxlevels); 2233 /* space needed by-size freespace btree */ 2234 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, 2235 mp->m_alloc_maxlevels); 2236 /* space needed reverse mapping used space btree */ 2237 if (xfs_has_rmapbt(mp)) 2238 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, 2239 mp->m_rmap_maxlevels); 2240 2241 return min_free; 2242 } 2243 2244 /* 2245 * Check if the operation we are fixing up the freelist for should go ahead or 2246 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2247 * is dependent on whether the size and shape of free space available will 2248 * permit the requested allocation to take place. 2249 */ 2250 static bool 2251 xfs_alloc_space_available( 2252 struct xfs_alloc_arg *args, 2253 xfs_extlen_t min_free, 2254 int flags) 2255 { 2256 struct xfs_perag *pag = args->pag; 2257 xfs_extlen_t alloc_len, longest; 2258 xfs_extlen_t reservation; /* blocks that are still reserved */ 2259 int available; 2260 xfs_extlen_t agflcount; 2261 2262 if (flags & XFS_ALLOC_FLAG_FREEING) 2263 return true; 2264 2265 reservation = xfs_ag_resv_needed(pag, args->resv); 2266 2267 /* do we have enough contiguous free space for the allocation? */ 2268 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2269 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2270 if (longest < alloc_len) 2271 return false; 2272 2273 /* 2274 * Do we have enough free space remaining for the allocation? Don't 2275 * account extra agfl blocks because we are about to defer free them, 2276 * making them unavailable until the current transaction commits. 2277 */ 2278 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2279 available = (int)(pag->pagf_freeblks + agflcount - 2280 reservation - min_free - args->minleft); 2281 if (available < (int)max(args->total, alloc_len)) 2282 return false; 2283 2284 /* 2285 * Clamp maxlen to the amount of free space available for the actual 2286 * extent allocation. 
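 *
 * As a rough worked example (numbers invented): with pagf_freeblks = 1000,
 * pagf_flcount = 4, min_free = 6, reservation = 100 and args->minleft = 0,
 * available = 1000 + 4 - 100 - 6 - 0 = 898, so a request with
 * args->maxlen = 1024 is trimmed to 898 here unless this is only the
 * XFS_ALLOC_FLAG_CHECK pass.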
2287 */ 2288 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2289 args->maxlen = available; 2290 ASSERT(args->maxlen > 0); 2291 ASSERT(args->maxlen >= args->minlen); 2292 } 2293 2294 return true; 2295 } 2296 2297 int 2298 xfs_free_agfl_block( 2299 struct xfs_trans *tp, 2300 xfs_agnumber_t agno, 2301 xfs_agblock_t agbno, 2302 struct xfs_buf *agbp, 2303 struct xfs_owner_info *oinfo) 2304 { 2305 int error; 2306 struct xfs_buf *bp; 2307 2308 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, 2309 XFS_AG_RESV_AGFL); 2310 if (error) 2311 return error; 2312 2313 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, 2314 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), 2315 tp->t_mountp->m_bsize, 0, &bp); 2316 if (error) 2317 return error; 2318 xfs_trans_binval(tp, bp); 2319 2320 return 0; 2321 } 2322 2323 /* 2324 * Check the agfl fields of the agf for inconsistency or corruption. The purpose 2325 * is to detect an agfl header padding mismatch between current and early v5 2326 * kernels. This problem manifests as a 1-slot size difference between the 2327 * on-disk flcount and the active [first, last] range of a wrapped agfl. This 2328 * may also catch variants of agfl count corruption unrelated to padding. Either 2329 * way, we'll reset the agfl and warn the user. 2330 * 2331 * Return true if a reset is required before the agfl can be used, false 2332 * otherwise. 2333 */ 2334 static bool 2335 xfs_agfl_needs_reset( 2336 struct xfs_mount *mp, 2337 struct xfs_agf *agf) 2338 { 2339 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2340 uint32_t l = be32_to_cpu(agf->agf_fllast); 2341 uint32_t c = be32_to_cpu(agf->agf_flcount); 2342 int agfl_size = xfs_agfl_size(mp); 2343 int active; 2344 2345 /* no agfl header on v4 supers */ 2346 if (!xfs_has_crc(mp)) 2347 return false; 2348 2349 /* 2350 * The agf read verifier catches severe corruption of these fields. 2351 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2352 * the verifier allows it. 2353 */ 2354 if (f >= agfl_size || l >= agfl_size) 2355 return true; 2356 if (c > agfl_size) 2357 return true; 2358 2359 /* 2360 * Check consistency between the on-disk count and the active range. An 2361 * agfl padding mismatch manifests as an inconsistent flcount. 2362 */ 2363 if (c && l >= f) 2364 active = l - f + 1; 2365 else if (c) 2366 active = agfl_size - f + l + 1; 2367 else 2368 active = 0; 2369 2370 return active != c; 2371 } 2372 2373 /* 2374 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2375 * agfl content cannot be trusted. Warn the user that a repair is required to 2376 * recover leaked blocks. 2377 * 2378 * The purpose of this mechanism is to handle filesystems affected by the agfl 2379 * header padding mismatch problem. A reset keeps the filesystem online with a 2380 * relatively minor free space accounting inconsistency rather than suffer the 2381 * inevitable crash from use of an invalid agfl block. 2382 */ 2383 static void 2384 xfs_agfl_reset( 2385 struct xfs_trans *tp, 2386 struct xfs_buf *agbp, 2387 struct xfs_perag *pag) 2388 { 2389 struct xfs_mount *mp = tp->t_mountp; 2390 struct xfs_agf *agf = agbp->b_addr; 2391 2392 ASSERT(xfs_perag_agfl_needs_reset(pag)); 2393 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2394 2395 xfs_warn(mp, 2396 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. 
" 2397 "Please unmount and run xfs_repair.", 2398 pag->pag_agno, pag->pagf_flcount); 2399 2400 agf->agf_flfirst = 0; 2401 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2402 agf->agf_flcount = 0; 2403 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2404 XFS_AGF_FLCOUNT); 2405 2406 pag->pagf_flcount = 0; 2407 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 2408 } 2409 2410 /* 2411 * Defer an AGFL block free. This is effectively equivalent to 2412 * xfs_free_extent_later() with some special handling particular to AGFL blocks. 2413 * 2414 * Deferring AGFL frees helps prevent log reservation overruns due to too many 2415 * allocation operations in a transaction. AGFL frees are prone to this problem 2416 * because for one they are always freed one at a time. Further, an immediate 2417 * AGFL block free can cause a btree join and require another block free before 2418 * the real allocation can proceed. Deferring the free disconnects freeing up 2419 * the AGFL slot from freeing the block. 2420 */ 2421 STATIC void 2422 xfs_defer_agfl_block( 2423 struct xfs_trans *tp, 2424 xfs_agnumber_t agno, 2425 xfs_fsblock_t agbno, 2426 struct xfs_owner_info *oinfo) 2427 { 2428 struct xfs_mount *mp = tp->t_mountp; 2429 struct xfs_extent_free_item *xefi; 2430 2431 ASSERT(xfs_extfree_item_cache != NULL); 2432 ASSERT(oinfo != NULL); 2433 2434 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2435 GFP_KERNEL | __GFP_NOFAIL); 2436 xefi->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno); 2437 xefi->xefi_blockcount = 1; 2438 xefi->xefi_owner = oinfo->oi_owner; 2439 2440 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); 2441 2442 xfs_extent_free_get_group(mp, xefi); 2443 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &xefi->xefi_list); 2444 } 2445 2446 /* 2447 * Add the extent to the list of extents to be free at transaction end. 2448 * The list is maintained sorted (by block number). 
2449 */ 2450 void 2451 __xfs_free_extent_later( 2452 struct xfs_trans *tp, 2453 xfs_fsblock_t bno, 2454 xfs_filblks_t len, 2455 const struct xfs_owner_info *oinfo, 2456 bool skip_discard) 2457 { 2458 struct xfs_extent_free_item *xefi; 2459 struct xfs_mount *mp = tp->t_mountp; 2460 #ifdef DEBUG 2461 xfs_agnumber_t agno; 2462 xfs_agblock_t agbno; 2463 2464 ASSERT(bno != NULLFSBLOCK); 2465 ASSERT(len > 0); 2466 ASSERT(len <= XFS_MAX_BMBT_EXTLEN); 2467 ASSERT(!isnullstartblock(bno)); 2468 agno = XFS_FSB_TO_AGNO(mp, bno); 2469 agbno = XFS_FSB_TO_AGBNO(mp, bno); 2470 ASSERT(agno < mp->m_sb.sb_agcount); 2471 ASSERT(agbno < mp->m_sb.sb_agblocks); 2472 ASSERT(len < mp->m_sb.sb_agblocks); 2473 ASSERT(agbno + len <= mp->m_sb.sb_agblocks); 2474 #endif 2475 ASSERT(xfs_extfree_item_cache != NULL); 2476 2477 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2478 GFP_KERNEL | __GFP_NOFAIL); 2479 xefi->xefi_startblock = bno; 2480 xefi->xefi_blockcount = (xfs_extlen_t)len; 2481 if (skip_discard) 2482 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; 2483 if (oinfo) { 2484 ASSERT(oinfo->oi_offset == 0); 2485 2486 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 2487 xefi->xefi_flags |= XFS_EFI_ATTR_FORK; 2488 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 2489 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; 2490 xefi->xefi_owner = oinfo->oi_owner; 2491 } else { 2492 xefi->xefi_owner = XFS_RMAP_OWN_NULL; 2493 } 2494 trace_xfs_bmap_free_defer(mp, 2495 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, 2496 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); 2497 2498 xfs_extent_free_get_group(mp, xefi); 2499 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &xefi->xefi_list); 2500 } 2501 2502 #ifdef DEBUG 2503 /* 2504 * Check if an AGF has a free extent record whose length is equal to 2505 * args->minlen. 2506 */ 2507 STATIC int 2508 xfs_exact_minlen_extent_available( 2509 struct xfs_alloc_arg *args, 2510 struct xfs_buf *agbp, 2511 int *stat) 2512 { 2513 struct xfs_btree_cur *cnt_cur; 2514 xfs_agblock_t fbno; 2515 xfs_extlen_t flen; 2516 int error = 0; 2517 2518 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp, 2519 args->pag, XFS_BTNUM_CNT); 2520 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); 2521 if (error) 2522 goto out; 2523 2524 if (*stat == 0) { 2525 error = -EFSCORRUPTED; 2526 goto out; 2527 } 2528 2529 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); 2530 if (error) 2531 goto out; 2532 2533 if (*stat == 1 && flen != args->minlen) 2534 *stat = 0; 2535 2536 out: 2537 xfs_btree_del_cursor(cnt_cur, error); 2538 2539 return error; 2540 } 2541 #endif 2542 2543 /* 2544 * Decide whether to use this allocation group for this allocation. 2545 * If so, fix up the btree freelist's size. 2546 */ 2547 int /* error */ 2548 xfs_alloc_fix_freelist( 2549 struct xfs_alloc_arg *args, /* allocation argument structure */ 2550 int flags) /* XFS_ALLOC_FLAG_... */ 2551 { 2552 struct xfs_mount *mp = args->mp; 2553 struct xfs_perag *pag = args->pag; 2554 struct xfs_trans *tp = args->tp; 2555 struct xfs_buf *agbp = NULL; 2556 struct xfs_buf *agflbp = NULL; 2557 struct xfs_alloc_arg targs; /* local allocation arguments */ 2558 xfs_agblock_t bno; /* freelist block */ 2559 xfs_extlen_t need; /* total blocks needed in freelist */ 2560 int error = 0; 2561 2562 /* deferred ops (AGFL block frees) require permanent transactions */ 2563 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2564 2565 if (!xfs_perag_initialised_agf(pag)) { 2566 error = xfs_alloc_read_agf(pag, tp, flags, &agbp); 2567 if (error) { 2568 /* Couldn't lock the AGF so skip this AG. 
*/ 2569 if (error == -EAGAIN) 2570 error = 0; 2571 goto out_no_agbp; 2572 } 2573 } 2574 2575 /* 2576 * If this is a metadata preferred pag and we are user data then try 2577 * somewhere else if we are not being asked to try harder at this 2578 * point 2579 */ 2580 if (xfs_perag_prefers_metadata(pag) && 2581 (args->datatype & XFS_ALLOC_USERDATA) && 2582 (flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2583 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING)); 2584 goto out_agbp_relse; 2585 } 2586 2587 need = xfs_alloc_min_freelist(mp, pag); 2588 if (!xfs_alloc_space_available(args, need, flags | 2589 XFS_ALLOC_FLAG_CHECK)) 2590 goto out_agbp_relse; 2591 2592 /* 2593 * Get the a.g. freespace buffer. 2594 * Can fail if we're not blocking on locks, and it's held. 2595 */ 2596 if (!agbp) { 2597 error = xfs_alloc_read_agf(pag, tp, flags, &agbp); 2598 if (error) { 2599 /* Couldn't lock the AGF so skip this AG. */ 2600 if (error == -EAGAIN) 2601 error = 0; 2602 goto out_no_agbp; 2603 } 2604 } 2605 2606 /* reset a padding mismatched agfl before final free space check */ 2607 if (xfs_perag_agfl_needs_reset(pag)) 2608 xfs_agfl_reset(tp, agbp, pag); 2609 2610 /* If there isn't enough total space or single-extent, reject it. */ 2611 need = xfs_alloc_min_freelist(mp, pag); 2612 if (!xfs_alloc_space_available(args, need, flags)) 2613 goto out_agbp_relse; 2614 2615 #ifdef DEBUG 2616 if (args->alloc_minlen_only) { 2617 int stat; 2618 2619 error = xfs_exact_minlen_extent_available(args, agbp, &stat); 2620 if (error || !stat) 2621 goto out_agbp_relse; 2622 } 2623 #endif 2624 /* 2625 * Make the freelist shorter if it's too long. 2626 * 2627 * Note that from this point onwards, we will always release the agf and 2628 * agfl buffers on error. This handles the case where we error out and 2629 * the buffers are clean or may not have been joined to the transaction 2630 * and hence need to be released manually. If they have been joined to 2631 * the transaction, then xfs_trans_brelse() will handle them 2632 * appropriately based on the recursion count and dirty state of the 2633 * buffer. 2634 * 2635 * XXX (dgc): When we have lots of free space, does this buy us 2636 * anything other than extra overhead when we need to put more blocks 2637 * back on the free list? Maybe we should only do this when space is 2638 * getting low or the AGFL is more than half full? 2639 * 2640 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2641 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2642 * updating the rmapbt. Both flags are used in xfs_repair while we're 2643 * rebuilding the rmapbt, and neither are used by the kernel. They're 2644 * both required to ensure that rmaps are correctly recorded for the 2645 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2646 * repair/rmap.c in xfsprogs for details. 
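 *
 * In outline, the loop below pops one block at a time off the AGFL with
 * xfs_alloc_get_freelist() and defers its free via xfs_defer_agfl_block()
 * until pagf_flcount is back down to the computed need, unless
 * XFS_ALLOC_FLAG_NOSHRINK suppresses the shrink entirely.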
2647 */ 2648 memset(&targs, 0, sizeof(targs)); 2649 /* struct copy below */ 2650 if (flags & XFS_ALLOC_FLAG_NORMAP) 2651 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2652 else 2653 targs.oinfo = XFS_RMAP_OINFO_AG; 2654 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) { 2655 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); 2656 if (error) 2657 goto out_agbp_relse; 2658 2659 /* defer agfl frees */ 2660 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); 2661 } 2662 2663 targs.tp = tp; 2664 targs.mp = mp; 2665 targs.agbp = agbp; 2666 targs.agno = args->agno; 2667 targs.alignment = targs.minlen = targs.prod = 1; 2668 targs.pag = pag; 2669 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2670 if (error) 2671 goto out_agbp_relse; 2672 2673 /* Make the freelist longer if it's too short. */ 2674 while (pag->pagf_flcount < need) { 2675 targs.agbno = 0; 2676 targs.maxlen = need - pag->pagf_flcount; 2677 targs.resv = XFS_AG_RESV_AGFL; 2678 2679 /* Allocate as many blocks as possible at once. */ 2680 error = xfs_alloc_ag_vextent_size(&targs); 2681 if (error) 2682 goto out_agflbp_relse; 2683 2684 /* 2685 * Stop if we run out. Won't happen if callers are obeying 2686 * the restrictions correctly. Can happen for free calls 2687 * on a completely full ag. 2688 */ 2689 if (targs.agbno == NULLAGBLOCK) { 2690 if (flags & XFS_ALLOC_FLAG_FREEING) 2691 break; 2692 goto out_agflbp_relse; 2693 } 2694 2695 if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { 2696 error = xfs_rmap_alloc(tp, agbp, pag, 2697 targs.agbno, targs.len, &targs.oinfo); 2698 if (error) 2699 goto out_agflbp_relse; 2700 } 2701 error = xfs_alloc_update_counters(tp, agbp, 2702 -((long)(targs.len))); 2703 if (error) 2704 goto out_agflbp_relse; 2705 2706 /* 2707 * Put each allocated block on the list. 2708 */ 2709 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 2710 error = xfs_alloc_put_freelist(pag, tp, agbp, 2711 agflbp, bno, 0); 2712 if (error) 2713 goto out_agflbp_relse; 2714 } 2715 } 2716 xfs_trans_brelse(tp, agflbp); 2717 args->agbp = agbp; 2718 return 0; 2719 2720 out_agflbp_relse: 2721 xfs_trans_brelse(tp, agflbp); 2722 out_agbp_relse: 2723 if (agbp) 2724 xfs_trans_brelse(tp, agbp); 2725 out_no_agbp: 2726 args->agbp = NULL; 2727 return error; 2728 } 2729 2730 /* 2731 * Get a block from the freelist. 2732 * Returns with the buffer for the block gotten. 2733 */ 2734 int 2735 xfs_alloc_get_freelist( 2736 struct xfs_perag *pag, 2737 struct xfs_trans *tp, 2738 struct xfs_buf *agbp, 2739 xfs_agblock_t *bnop, 2740 int btreeblk) 2741 { 2742 struct xfs_agf *agf = agbp->b_addr; 2743 struct xfs_buf *agflbp; 2744 xfs_agblock_t bno; 2745 __be32 *agfl_bno; 2746 int error; 2747 uint32_t logflags; 2748 struct xfs_mount *mp = tp->t_mountp; 2749 2750 /* 2751 * Freelist is empty, give up. 2752 */ 2753 if (!agf->agf_flcount) { 2754 *bnop = NULLAGBLOCK; 2755 return 0; 2756 } 2757 /* 2758 * Read the array of free blocks. 2759 */ 2760 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2761 if (error) 2762 return error; 2763 2764 2765 /* 2766 * Get the block number and update the data structures. 
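 *
 * The AGFL is a small circular array: agf_flfirst indexes the oldest entry,
 * agf_fllast the newest, and each index wraps back to slot zero once it
 * reaches xfs_agfl_size(). Consuming a block therefore just reads the slot
 * at agf_flfirst, advances agf_flfirst and drops agf_flcount by one.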
2767 */ 2768 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2769 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 2770 be32_add_cpu(&agf->agf_flfirst, 1); 2771 xfs_trans_brelse(tp, agflbp); 2772 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 2773 agf->agf_flfirst = 0; 2774 2775 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 2776 be32_add_cpu(&agf->agf_flcount, -1); 2777 pag->pagf_flcount--; 2778 2779 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 2780 if (btreeblk) { 2781 be32_add_cpu(&agf->agf_btreeblks, 1); 2782 pag->pagf_btreeblks++; 2783 logflags |= XFS_AGF_BTREEBLKS; 2784 } 2785 2786 xfs_alloc_log_agf(tp, agbp, logflags); 2787 *bnop = bno; 2788 2789 return 0; 2790 } 2791 2792 /* 2793 * Log the given fields from the agf structure. 2794 */ 2795 void 2796 xfs_alloc_log_agf( 2797 struct xfs_trans *tp, 2798 struct xfs_buf *bp, 2799 uint32_t fields) 2800 { 2801 int first; /* first byte offset */ 2802 int last; /* last byte offset */ 2803 static const short offsets[] = { 2804 offsetof(xfs_agf_t, agf_magicnum), 2805 offsetof(xfs_agf_t, agf_versionnum), 2806 offsetof(xfs_agf_t, agf_seqno), 2807 offsetof(xfs_agf_t, agf_length), 2808 offsetof(xfs_agf_t, agf_roots[0]), 2809 offsetof(xfs_agf_t, agf_levels[0]), 2810 offsetof(xfs_agf_t, agf_flfirst), 2811 offsetof(xfs_agf_t, agf_fllast), 2812 offsetof(xfs_agf_t, agf_flcount), 2813 offsetof(xfs_agf_t, agf_freeblks), 2814 offsetof(xfs_agf_t, agf_longest), 2815 offsetof(xfs_agf_t, agf_btreeblks), 2816 offsetof(xfs_agf_t, agf_uuid), 2817 offsetof(xfs_agf_t, agf_rmap_blocks), 2818 offsetof(xfs_agf_t, agf_refcount_blocks), 2819 offsetof(xfs_agf_t, agf_refcount_root), 2820 offsetof(xfs_agf_t, agf_refcount_level), 2821 /* needed so that we don't log the whole rest of the structure: */ 2822 offsetof(xfs_agf_t, agf_spare64), 2823 sizeof(xfs_agf_t) 2824 }; 2825 2826 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); 2827 2828 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 2829 2830 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 2831 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 2832 } 2833 2834 /* 2835 * Put the block on the freelist for the allocation group. 
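 *
 * This is the inverse of xfs_alloc_get_freelist(): the block number is
 * stored in the slot after agf_fllast (wrapping as needed) and agf_flcount
 * is bumped. When @btreeblk is set the block is being handed back by a free
 * space btree, so agf_btreeblks is decremented to keep the count of blocks
 * consumed by those btrees in step.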
2836 */ 2837 int 2838 xfs_alloc_put_freelist( 2839 struct xfs_perag *pag, 2840 struct xfs_trans *tp, 2841 struct xfs_buf *agbp, 2842 struct xfs_buf *agflbp, 2843 xfs_agblock_t bno, 2844 int btreeblk) 2845 { 2846 struct xfs_mount *mp = tp->t_mountp; 2847 struct xfs_agf *agf = agbp->b_addr; 2848 __be32 *blockp; 2849 int error; 2850 uint32_t logflags; 2851 __be32 *agfl_bno; 2852 int startoff; 2853 2854 if (!agflbp) { 2855 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2856 if (error) 2857 return error; 2858 } 2859 2860 be32_add_cpu(&agf->agf_fllast, 1); 2861 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 2862 agf->agf_fllast = 0; 2863 2864 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 2865 be32_add_cpu(&agf->agf_flcount, 1); 2866 pag->pagf_flcount++; 2867 2868 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 2869 if (btreeblk) { 2870 be32_add_cpu(&agf->agf_btreeblks, -1); 2871 pag->pagf_btreeblks--; 2872 logflags |= XFS_AGF_BTREEBLKS; 2873 } 2874 2875 xfs_alloc_log_agf(tp, agbp, logflags); 2876 2877 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 2878 2879 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 2880 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 2881 *blockp = cpu_to_be32(bno); 2882 startoff = (char *)blockp - (char *)agflbp->b_addr; 2883 2884 xfs_alloc_log_agf(tp, agbp, logflags); 2885 2886 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 2887 xfs_trans_log_buf(tp, agflbp, startoff, 2888 startoff + sizeof(xfs_agblock_t) - 1); 2889 return 0; 2890 } 2891 2892 static xfs_failaddr_t 2893 xfs_agf_verify( 2894 struct xfs_buf *bp) 2895 { 2896 struct xfs_mount *mp = bp->b_mount; 2897 struct xfs_agf *agf = bp->b_addr; 2898 2899 if (xfs_has_crc(mp)) { 2900 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 2901 return __this_address; 2902 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) 2903 return __this_address; 2904 } 2905 2906 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 2907 return __this_address; 2908 2909 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) && 2910 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) && 2911 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) && 2912 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) && 2913 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp))) 2914 return __this_address; 2915 2916 if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks) 2917 return __this_address; 2918 2919 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || 2920 be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length)) 2921 return __this_address; 2922 2923 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 || 2924 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 || 2925 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > 2926 mp->m_alloc_maxlevels || 2927 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > 2928 mp->m_alloc_maxlevels) 2929 return __this_address; 2930 2931 if (xfs_has_rmapbt(mp) && 2932 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 || 2933 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) > 2934 mp->m_rmap_maxlevels)) 2935 return __this_address; 2936 2937 if (xfs_has_rmapbt(mp) && 2938 be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length)) 2939 return __this_address; 2940 2941 /* 2942 * during growfs operations, the perag is not fully initialised, 2943 * so we can't use it for any useful checking. growfs ensures we can't 2944 * use it by using uncached buffers that don't have the perag attached 2945 * so we can detect and avoid this problem. 
2946 */ 2947 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno) 2948 return __this_address; 2949 2950 if (xfs_has_lazysbcount(mp) && 2951 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length)) 2952 return __this_address; 2953 2954 if (xfs_has_reflink(mp) && 2955 be32_to_cpu(agf->agf_refcount_blocks) > 2956 be32_to_cpu(agf->agf_length)) 2957 return __this_address; 2958 2959 if (xfs_has_reflink(mp) && 2960 (be32_to_cpu(agf->agf_refcount_level) < 1 || 2961 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels)) 2962 return __this_address; 2963 2964 return NULL; 2965 2966 } 2967 2968 static void 2969 xfs_agf_read_verify( 2970 struct xfs_buf *bp) 2971 { 2972 struct xfs_mount *mp = bp->b_mount; 2973 xfs_failaddr_t fa; 2974 2975 if (xfs_has_crc(mp) && 2976 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 2977 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 2978 else { 2979 fa = xfs_agf_verify(bp); 2980 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 2981 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 2982 } 2983 } 2984 2985 static void 2986 xfs_agf_write_verify( 2987 struct xfs_buf *bp) 2988 { 2989 struct xfs_mount *mp = bp->b_mount; 2990 struct xfs_buf_log_item *bip = bp->b_log_item; 2991 struct xfs_agf *agf = bp->b_addr; 2992 xfs_failaddr_t fa; 2993 2994 fa = xfs_agf_verify(bp); 2995 if (fa) { 2996 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 2997 return; 2998 } 2999 3000 if (!xfs_has_crc(mp)) 3001 return; 3002 3003 if (bip) 3004 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 3005 3006 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 3007 } 3008 3009 const struct xfs_buf_ops xfs_agf_buf_ops = { 3010 .name = "xfs_agf", 3011 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 3012 .verify_read = xfs_agf_read_verify, 3013 .verify_write = xfs_agf_write_verify, 3014 .verify_struct = xfs_agf_verify, 3015 }; 3016 3017 /* 3018 * Read in the allocation group header (free/alloc section). 3019 */ 3020 int 3021 xfs_read_agf( 3022 struct xfs_perag *pag, 3023 struct xfs_trans *tp, 3024 int flags, 3025 struct xfs_buf **agfbpp) 3026 { 3027 struct xfs_mount *mp = pag->pag_mount; 3028 int error; 3029 3030 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); 3031 3032 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 3033 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), 3034 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); 3035 if (error) 3036 return error; 3037 3038 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); 3039 return 0; 3040 } 3041 3042 /* 3043 * Read in the allocation group header (free/alloc section) and initialise the 3044 * perag structure if necessary. If the caller provides @agfbpp, then return the 3045 * locked buffer to the caller, otherwise free it. 3046 */ 3047 int 3048 xfs_alloc_read_agf( 3049 struct xfs_perag *pag, 3050 struct xfs_trans *tp, 3051 int flags, 3052 struct xfs_buf **agfbpp) 3053 { 3054 struct xfs_buf *agfbp; 3055 struct xfs_agf *agf; 3056 int error; 3057 int allocbt_blks; 3058 3059 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); 3060 3061 /* We don't support trylock when freeing. */ 3062 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 3063 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 3064 error = xfs_read_agf(pag, tp, 3065 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? 
XBF_TRYLOCK : 0, 3066 &agfbp); 3067 if (error) 3068 return error; 3069 3070 agf = agfbp->b_addr; 3071 if (!xfs_perag_initialised_agf(pag)) { 3072 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3073 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3074 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3075 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3076 pag->pagf_levels[XFS_BTNUM_BNOi] = 3077 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]); 3078 pag->pagf_levels[XFS_BTNUM_CNTi] = 3079 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]); 3080 pag->pagf_levels[XFS_BTNUM_RMAPi] = 3081 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]); 3082 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3083 if (xfs_agfl_needs_reset(pag->pag_mount, agf)) 3084 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3085 else 3086 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3087 3088 /* 3089 * Update the in-core allocbt counter. Filter out the rmapbt 3090 * subset of the btreeblks counter because the rmapbt is managed 3091 * by perag reservation. Subtract one for the rmapbt root block 3092 * because the rmap counter includes it while the btreeblks 3093 * counter only tracks non-root blocks. 3094 */ 3095 allocbt_blks = pag->pagf_btreeblks; 3096 if (xfs_has_rmapbt(pag->pag_mount)) 3097 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; 3098 if (allocbt_blks > 0) 3099 atomic64_add(allocbt_blks, 3100 &pag->pag_mount->m_allocbt_blks); 3101 3102 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); 3103 } 3104 #ifdef DEBUG 3105 else if (!xfs_is_shutdown(pag->pag_mount)) { 3106 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3107 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3108 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3109 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3110 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] == 3111 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi])); 3112 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] == 3113 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi])); 3114 } 3115 #endif 3116 if (agfbpp) 3117 *agfbpp = agfbp; 3118 else 3119 xfs_trans_brelse(tp, agfbp); 3120 return 0; 3121 } 3122 3123 /* 3124 * Pre-proces allocation arguments to set initial state that we don't require 3125 * callers to set up correctly, as well as bounds check the allocation args 3126 * that are set up. 3127 */ 3128 static int 3129 xfs_alloc_vextent_check_args( 3130 struct xfs_alloc_arg *args, 3131 xfs_fsblock_t target, 3132 xfs_agnumber_t *minimum_agno) 3133 { 3134 struct xfs_mount *mp = args->mp; 3135 xfs_agblock_t agsize; 3136 3137 args->fsbno = NULLFSBLOCK; 3138 3139 *minimum_agno = 0; 3140 if (args->tp->t_highest_agno != NULLAGNUMBER) 3141 *minimum_agno = args->tp->t_highest_agno; 3142 3143 /* 3144 * Just fix this up, for the case where the last a.g. is shorter 3145 * (or there's only one a.g.) and the caller couldn't easily figure 3146 * that out (xfs_bmap_alloc). 
3147 */ 3148 agsize = mp->m_sb.sb_agblocks; 3149 if (args->maxlen > agsize) 3150 args->maxlen = agsize; 3151 if (args->alignment == 0) 3152 args->alignment = 1; 3153 3154 ASSERT(args->minlen > 0); 3155 ASSERT(args->maxlen > 0); 3156 ASSERT(args->alignment > 0); 3157 ASSERT(args->resv != XFS_AG_RESV_AGFL); 3158 3159 ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); 3160 ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); 3161 ASSERT(args->minlen <= args->maxlen); 3162 ASSERT(args->minlen <= agsize); 3163 ASSERT(args->mod < args->prod); 3164 3165 if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || 3166 XFS_FSB_TO_AGBNO(mp, target) >= agsize || 3167 args->minlen > args->maxlen || args->minlen > agsize || 3168 args->mod >= args->prod) { 3169 trace_xfs_alloc_vextent_badargs(args); 3170 return -ENOSPC; 3171 } 3172 3173 if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { 3174 trace_xfs_alloc_vextent_skip_deadlock(args); 3175 return -ENOSPC; 3176 } 3177 return 0; 3178 3179 } 3180 3181 /* 3182 * Prepare an AG for allocation. If the AG is not prepared to accept the 3183 * allocation, return failure. 3184 * 3185 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are 3186 * modified to hold their own perag references. 3187 */ 3188 static int 3189 xfs_alloc_vextent_prepare_ag( 3190 struct xfs_alloc_arg *args) 3191 { 3192 bool need_pag = !args->pag; 3193 int error; 3194 3195 if (need_pag) 3196 args->pag = xfs_perag_get(args->mp, args->agno); 3197 3198 args->agbp = NULL; 3199 error = xfs_alloc_fix_freelist(args, 0); 3200 if (error) { 3201 trace_xfs_alloc_vextent_nofix(args); 3202 if (need_pag) 3203 xfs_perag_put(args->pag); 3204 args->agbno = NULLAGBLOCK; 3205 return error; 3206 } 3207 if (!args->agbp) { 3208 /* cannot allocate in this AG at all */ 3209 trace_xfs_alloc_vextent_noagbp(args); 3210 args->agbno = NULLAGBLOCK; 3211 return 0; 3212 } 3213 args->wasfromfl = 0; 3214 return 0; 3215 } 3216 3217 /* 3218 * Post-process allocation results to account for the allocation if it succeed 3219 * and set the allocated block number correctly for the caller. 3220 * 3221 * XXX: we should really be returning ENOSPC for ENOSPC, not 3222 * hiding it behind a "successful" NULLFSBLOCK allocation. 3223 */ 3224 static int 3225 xfs_alloc_vextent_finish( 3226 struct xfs_alloc_arg *args, 3227 xfs_agnumber_t minimum_agno, 3228 int alloc_error, 3229 bool drop_perag) 3230 { 3231 struct xfs_mount *mp = args->mp; 3232 int error = 0; 3233 3234 /* 3235 * We can end up here with a locked AGF. If we failed, the caller is 3236 * likely going to try to allocate again with different parameters, and 3237 * that can widen the AGs that are searched for free space. If we have 3238 * to do BMBT block allocation, we have to do a new allocation. 3239 * 3240 * Hence leaving this function with the AGF locked opens up potential 3241 * ABBA AGF deadlocks because a future allocation attempt in this 3242 * transaction may attempt to lock a lower number AGF. 3243 * 3244 * We can't release the AGF until the transaction is commited, so at 3245 * this point we must update the "first allocation" tracker to point at 3246 * this AG if the tracker is empty or points to a lower AG. This allows 3247 * the next allocation attempt to be modified appropriately to avoid 3248 * deadlocks. 
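 *
 * For example, if this allocation ends up holding the AGF of AG 3 locked,
 * any later bmbt block allocation in the same transaction must be directed
 * at AG 3 or higher; recording the AG number in t_highest_agno below is
 * what lets xfs_alloc_vextent_check_args() enforce that ordering.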
3249 */ 3250 if (args->agbp && 3251 (args->tp->t_highest_agno == NULLAGNUMBER || 3252 args->agno > minimum_agno)) 3253 args->tp->t_highest_agno = args->agno; 3254 3255 /* 3256 * If the allocation failed with an error or we had an ENOSPC result, 3257 * preserve the returned error whilst also marking the allocation result 3258 * as "no extent allocated". This ensures that callers that fail to 3259 * capture the error will still treat it as a failed allocation. 3260 */ 3261 if (alloc_error || args->agbno == NULLAGBLOCK) { 3262 args->fsbno = NULLFSBLOCK; 3263 error = alloc_error; 3264 goto out_drop_perag; 3265 } 3266 3267 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); 3268 3269 ASSERT(args->len >= args->minlen); 3270 ASSERT(args->len <= args->maxlen); 3271 ASSERT(args->agbno % args->alignment == 0); 3272 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); 3273 3274 /* if not file data, insert new block into the reverse map btree */ 3275 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 3276 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, 3277 args->agbno, args->len, &args->oinfo); 3278 if (error) 3279 goto out_drop_perag; 3280 } 3281 3282 if (!args->wasfromfl) { 3283 error = xfs_alloc_update_counters(args->tp, args->agbp, 3284 -((long)(args->len))); 3285 if (error) 3286 goto out_drop_perag; 3287 3288 ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, 3289 args->len)); 3290 } 3291 3292 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 3293 3294 XFS_STATS_INC(mp, xs_allocx); 3295 XFS_STATS_ADD(mp, xs_allocb, args->len); 3296 3297 trace_xfs_alloc_vextent_finish(args); 3298 3299 out_drop_perag: 3300 if (drop_perag && args->pag) { 3301 xfs_perag_rele(args->pag); 3302 args->pag = NULL; 3303 } 3304 return error; 3305 } 3306 3307 /* 3308 * Allocate within a single AG only. This uses a best-fit length algorithm so if 3309 * you need an exact sized allocation without locality constraints, this is the 3310 * fastest way to do it. 3311 * 3312 * Caller is expected to hold a perag reference in args->pag. 3313 */ 3314 int 3315 xfs_alloc_vextent_this_ag( 3316 struct xfs_alloc_arg *args, 3317 xfs_agnumber_t agno) 3318 { 3319 struct xfs_mount *mp = args->mp; 3320 xfs_agnumber_t minimum_agno; 3321 int error; 3322 3323 ASSERT(args->pag != NULL); 3324 ASSERT(args->pag->pag_agno == agno); 3325 3326 args->agno = agno; 3327 args->agbno = 0; 3328 3329 trace_xfs_alloc_vextent_this_ag(args); 3330 3331 error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), 3332 &minimum_agno); 3333 if (error) { 3334 if (error == -ENOSPC) 3335 return 0; 3336 return error; 3337 } 3338 3339 error = xfs_alloc_vextent_prepare_ag(args); 3340 if (!error && args->agbp) 3341 error = xfs_alloc_ag_vextent_size(args); 3342 3343 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3344 } 3345 3346 /* 3347 * Iterate all AGs trying to allocate an extent starting from @start_ag. 3348 * 3349 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the 3350 * allocation attempts in @start_agno have locality information. If we fail to 3351 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs 3352 * we attempt to allocation in as there is no locality optimisation possible for 3353 * those allocations. 3354 * 3355 * On return, args->pag may be left referenced if we finish before the "all 3356 * failed" return point. 
The allocation finish still needs the perag, and 3357 * so the caller will release it once they've finished the allocation. 3358 * 3359 * When we wrap the AG iteration at the end of the filesystem, we have to be 3360 * careful not to wrap into AGs below ones we already have locked in the 3361 * transaction if we are doing a blocking iteration. This will result in an 3362 * out-of-order locking of AGFs and hence can cause deadlocks. 3363 */ 3364 static int 3365 xfs_alloc_vextent_iterate_ags( 3366 struct xfs_alloc_arg *args, 3367 xfs_agnumber_t minimum_agno, 3368 xfs_agnumber_t start_agno, 3369 xfs_agblock_t target_agbno, 3370 uint32_t flags) 3371 { 3372 struct xfs_mount *mp = args->mp; 3373 xfs_agnumber_t restart_agno = minimum_agno; 3374 xfs_agnumber_t agno; 3375 int error = 0; 3376 3377 if (flags & XFS_ALLOC_FLAG_TRYLOCK) 3378 restart_agno = 0; 3379 restart: 3380 for_each_perag_wrap_range(mp, start_agno, restart_agno, 3381 mp->m_sb.sb_agcount, agno, args->pag) { 3382 args->agno = agno; 3383 error = xfs_alloc_vextent_prepare_ag(args); 3384 if (error) 3385 break; 3386 if (!args->agbp) { 3387 trace_xfs_alloc_vextent_loopfailed(args); 3388 continue; 3389 } 3390 3391 /* 3392 * Allocation is supposed to succeed now, so break out of the 3393 * loop regardless of whether we succeed or not. 3394 */ 3395 if (args->agno == start_agno && target_agbno) { 3396 args->agbno = target_agbno; 3397 error = xfs_alloc_ag_vextent_near(args); 3398 } else { 3399 args->agbno = 0; 3400 error = xfs_alloc_ag_vextent_size(args); 3401 } 3402 break; 3403 } 3404 if (error) { 3405 xfs_perag_rele(args->pag); 3406 args->pag = NULL; 3407 return error; 3408 } 3409 if (args->agbp) 3410 return 0; 3411 3412 /* 3413 * We didn't find an AG we can alloation from. If we were given 3414 * constraining flags by the caller, drop them and retry the allocation 3415 * without any constraints being set. 3416 */ 3417 if (flags) { 3418 flags = 0; 3419 restart_agno = minimum_agno; 3420 goto restart; 3421 } 3422 3423 ASSERT(args->pag == NULL); 3424 trace_xfs_alloc_vextent_allfailed(args); 3425 return 0; 3426 } 3427 3428 /* 3429 * Iterate from the AGs from the start AG to the end of the filesystem, trying 3430 * to allocate blocks. It starts with a near allocation attempt in the initial 3431 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap 3432 * back to zero if allowed by previous allocations in this transaction, 3433 * otherwise will wrap back to the start AG and run a second blocking pass to 3434 * the end of the filesystem. 
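 *
 * A rough sketch of a caller, with invented variable names and the rmap
 * owner setup and error handling omitted:
 *
 *	struct xfs_alloc_arg	args = {
 *		.tp	= tp,
 *		.mp	= mp,
 *		.minlen	= 1,
 *		.maxlen	= count_fsb,
 *		.prod	= 1,
 *		.resv	= XFS_AG_RESV_NONE,
 *	};
 *
 *	error = xfs_alloc_vextent_start_ag(&args, target_fsb);
 *
 * A zero return with args.fsbno == NULLFSBLOCK means no AG could satisfy
 * the request; otherwise args.fsbno and args.len describe the new extent.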
3435 */ 3436 int 3437 xfs_alloc_vextent_start_ag( 3438 struct xfs_alloc_arg *args, 3439 xfs_fsblock_t target) 3440 { 3441 struct xfs_mount *mp = args->mp; 3442 xfs_agnumber_t minimum_agno; 3443 xfs_agnumber_t start_agno; 3444 xfs_agnumber_t rotorstep = xfs_rotorstep; 3445 bool bump_rotor = false; 3446 int error; 3447 3448 ASSERT(args->pag == NULL); 3449 3450 args->agno = NULLAGNUMBER; 3451 args->agbno = NULLAGBLOCK; 3452 3453 trace_xfs_alloc_vextent_start_ag(args); 3454 3455 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3456 if (error) { 3457 if (error == -ENOSPC) 3458 return 0; 3459 return error; 3460 } 3461 3462 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3463 xfs_is_inode32(mp)) { 3464 target = XFS_AGB_TO_FSB(mp, 3465 ((mp->m_agfrotor / rotorstep) % 3466 mp->m_sb.sb_agcount), 0); 3467 bump_rotor = 1; 3468 } 3469 3470 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3471 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3472 XFS_FSB_TO_AGBNO(mp, target), XFS_ALLOC_FLAG_TRYLOCK); 3473 3474 if (bump_rotor) { 3475 if (args->agno == start_agno) 3476 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3477 (mp->m_sb.sb_agcount * rotorstep); 3478 else 3479 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3480 (mp->m_sb.sb_agcount * rotorstep); 3481 } 3482 3483 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3484 } 3485 3486 /* 3487 * Iterate from the agno indicated via @target through to the end of the 3488 * filesystem attempting blocking allocation. This does not wrap or try a second 3489 * pass, so will not recurse into AGs lower than indicated by the target. 3490 */ 3491 int 3492 xfs_alloc_vextent_first_ag( 3493 struct xfs_alloc_arg *args, 3494 xfs_fsblock_t target) 3495 { 3496 struct xfs_mount *mp = args->mp; 3497 xfs_agnumber_t minimum_agno; 3498 xfs_agnumber_t start_agno; 3499 int error; 3500 3501 ASSERT(args->pag == NULL); 3502 3503 args->agno = NULLAGNUMBER; 3504 args->agbno = NULLAGBLOCK; 3505 3506 trace_xfs_alloc_vextent_first_ag(args); 3507 3508 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3509 if (error) { 3510 if (error == -ENOSPC) 3511 return 0; 3512 return error; 3513 } 3514 3515 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3516 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3517 XFS_FSB_TO_AGBNO(mp, target), 0); 3518 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3519 } 3520 3521 /* 3522 * Allocate at the exact block target or fail. Caller is expected to hold a 3523 * perag reference in args->pag. 3524 */ 3525 int 3526 xfs_alloc_vextent_exact_bno( 3527 struct xfs_alloc_arg *args, 3528 xfs_fsblock_t target) 3529 { 3530 struct xfs_mount *mp = args->mp; 3531 xfs_agnumber_t minimum_agno; 3532 int error; 3533 3534 ASSERT(args->pag != NULL); 3535 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); 3536 3537 args->agno = XFS_FSB_TO_AGNO(mp, target); 3538 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3539 3540 trace_xfs_alloc_vextent_exact_bno(args); 3541 3542 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3543 if (error) { 3544 if (error == -ENOSPC) 3545 return 0; 3546 return error; 3547 } 3548 3549 error = xfs_alloc_vextent_prepare_ag(args); 3550 if (!error && args->agbp) 3551 error = xfs_alloc_ag_vextent_exact(args); 3552 3553 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3554 } 3555 3556 /* 3557 * Allocate an extent as close to the target as possible. 
If there are not 3558 * viable candidates in the AG, then fail the allocation. 3559 * 3560 * Caller may or may not have a per-ag reference in args->pag. 3561 */ 3562 int 3563 xfs_alloc_vextent_near_bno( 3564 struct xfs_alloc_arg *args, 3565 xfs_fsblock_t target) 3566 { 3567 struct xfs_mount *mp = args->mp; 3568 xfs_agnumber_t minimum_agno; 3569 bool needs_perag = args->pag == NULL; 3570 int error; 3571 3572 if (!needs_perag) 3573 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); 3574 3575 args->agno = XFS_FSB_TO_AGNO(mp, target); 3576 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3577 3578 trace_xfs_alloc_vextent_near_bno(args); 3579 3580 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3581 if (error) { 3582 if (error == -ENOSPC) 3583 return 0; 3584 return error; 3585 } 3586 3587 if (needs_perag) 3588 args->pag = xfs_perag_grab(mp, args->agno); 3589 3590 error = xfs_alloc_vextent_prepare_ag(args); 3591 if (!error && args->agbp) 3592 error = xfs_alloc_ag_vextent_near(args); 3593 3594 return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); 3595 } 3596 3597 /* Ensure that the freelist is at full capacity. */ 3598 int 3599 xfs_free_extent_fix_freelist( 3600 struct xfs_trans *tp, 3601 struct xfs_perag *pag, 3602 struct xfs_buf **agbp) 3603 { 3604 struct xfs_alloc_arg args; 3605 int error; 3606 3607 memset(&args, 0, sizeof(struct xfs_alloc_arg)); 3608 args.tp = tp; 3609 args.mp = tp->t_mountp; 3610 args.agno = pag->pag_agno; 3611 args.pag = pag; 3612 3613 /* 3614 * validate that the block number is legal - the enables us to detect 3615 * and handle a silent filesystem corruption rather than crashing. 3616 */ 3617 if (args.agno >= args.mp->m_sb.sb_agcount) 3618 return -EFSCORRUPTED; 3619 3620 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 3621 if (error) 3622 return error; 3623 3624 *agbp = args.agbp; 3625 return 0; 3626 } 3627 3628 /* 3629 * Free an extent. 3630 * Just break up the extent address and hand off to xfs_free_ag_extent 3631 * after fixing up the freelist. 
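 *
 * Most callers use the xfs_free_extent() inline wrapper, which simply passes
 * skip_discard = false. A minimal sketch (perag lookup, owner info setup and
 * error handling elided):
 *
 *	error = xfs_free_extent(tp, pag, agbno, len, &oinfo,
 *			XFS_AG_RESV_NONE);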
3632 */ 3633 int 3634 __xfs_free_extent( 3635 struct xfs_trans *tp, 3636 struct xfs_perag *pag, 3637 xfs_agblock_t agbno, 3638 xfs_extlen_t len, 3639 const struct xfs_owner_info *oinfo, 3640 enum xfs_ag_resv_type type, 3641 bool skip_discard) 3642 { 3643 struct xfs_mount *mp = tp->t_mountp; 3644 struct xfs_buf *agbp; 3645 struct xfs_agf *agf; 3646 int error; 3647 unsigned int busy_flags = 0; 3648 3649 ASSERT(len != 0); 3650 ASSERT(type != XFS_AG_RESV_AGFL); 3651 3652 if (XFS_TEST_ERROR(false, mp, 3653 XFS_ERRTAG_FREE_EXTENT)) 3654 return -EIO; 3655 3656 error = xfs_free_extent_fix_freelist(tp, pag, &agbp); 3657 if (error) 3658 return error; 3659 agf = agbp->b_addr; 3660 3661 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { 3662 error = -EFSCORRUPTED; 3663 goto err_release; 3664 } 3665 3666 /* validate the extent size is legal now we have the agf locked */ 3667 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { 3668 error = -EFSCORRUPTED; 3669 goto err_release; 3670 } 3671 3672 error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, 3673 type); 3674 if (error) 3675 goto err_release; 3676 3677 if (skip_discard) 3678 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; 3679 xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); 3680 return 0; 3681 3682 err_release: 3683 xfs_trans_brelse(tp, agbp); 3684 return error; 3685 } 3686 3687 struct xfs_alloc_query_range_info { 3688 xfs_alloc_query_range_fn fn; 3689 void *priv; 3690 }; 3691 3692 /* Format btree record and pass to our callback. */ 3693 STATIC int 3694 xfs_alloc_query_range_helper( 3695 struct xfs_btree_cur *cur, 3696 const union xfs_btree_rec *rec, 3697 void *priv) 3698 { 3699 struct xfs_alloc_query_range_info *query = priv; 3700 struct xfs_alloc_rec_incore irec; 3701 xfs_failaddr_t fa; 3702 3703 xfs_alloc_btrec_to_irec(rec, &irec); 3704 fa = xfs_alloc_check_irec(cur, &irec); 3705 if (fa) 3706 return xfs_alloc_complain_bad_rec(cur, fa, &irec); 3707 3708 return query->fn(cur, &irec, query->priv); 3709 } 3710 3711 /* Find all free space within a given range of blocks. */ 3712 int 3713 xfs_alloc_query_range( 3714 struct xfs_btree_cur *cur, 3715 const struct xfs_alloc_rec_incore *low_rec, 3716 const struct xfs_alloc_rec_incore *high_rec, 3717 xfs_alloc_query_range_fn fn, 3718 void *priv) 3719 { 3720 union xfs_btree_irec low_brec; 3721 union xfs_btree_irec high_brec; 3722 struct xfs_alloc_query_range_info query; 3723 3724 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3725 low_brec.a = *low_rec; 3726 high_brec.a = *high_rec; 3727 query.priv = priv; 3728 query.fn = fn; 3729 return xfs_btree_query_range(cur, &low_brec, &high_brec, 3730 xfs_alloc_query_range_helper, &query); 3731 } 3732 3733 /* Find all free space records. */ 3734 int 3735 xfs_alloc_query_all( 3736 struct xfs_btree_cur *cur, 3737 xfs_alloc_query_range_fn fn, 3738 void *priv) 3739 { 3740 struct xfs_alloc_query_range_info query; 3741 3742 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO); 3743 query.priv = priv; 3744 query.fn = fn; 3745 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); 3746 } 3747 3748 /* 3749 * Scan part of the keyspace of the free space and tell us if the area has no 3750 * records, is fully mapped by records, or is partially filled. 
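 *
 * For example (sketch), a caller that wants to know whether the range
 * [bno, bno + len) is entirely free space could do:
 *
 *	enum xbtree_recpacking	outcome;
 *
 *	error = xfs_alloc_has_records(cur, bno, len, &outcome);
 *
 * XBTREE_RECPACKING_FULL then means the whole range is covered by free space
 * records, XBTREE_RECPACKING_EMPTY that none of it is, and
 * XBTREE_RECPACKING_SPARSE that only part of it is.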
3751 */ 3752 int 3753 xfs_alloc_has_records( 3754 struct xfs_btree_cur *cur, 3755 xfs_agblock_t bno, 3756 xfs_extlen_t len, 3757 enum xbtree_recpacking *outcome) 3758 { 3759 union xfs_btree_irec low; 3760 union xfs_btree_irec high; 3761 3762 memset(&low, 0, sizeof(low)); 3763 low.a.ar_startblock = bno; 3764 memset(&high, 0xFF, sizeof(high)); 3765 high.a.ar_startblock = bno + len - 1; 3766 3767 return xfs_btree_has_records(cur, &low, &high, NULL, outcome); 3768 } 3769 3770 /* 3771 * Walk all the blocks in the AGFL. The @walk_fn can return any negative 3772 * error code or XFS_ITER_*. 3773 */ 3774 int 3775 xfs_agfl_walk( 3776 struct xfs_mount *mp, 3777 struct xfs_agf *agf, 3778 struct xfs_buf *agflbp, 3779 xfs_agfl_walk_fn walk_fn, 3780 void *priv) 3781 { 3782 __be32 *agfl_bno; 3783 unsigned int i; 3784 int error; 3785 3786 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3787 i = be32_to_cpu(agf->agf_flfirst); 3788 3789 /* Nothing to walk in an empty AGFL. */ 3790 if (agf->agf_flcount == cpu_to_be32(0)) 3791 return 0; 3792 3793 /* Otherwise, walk from first to last, wrapping as needed. */ 3794 for (;;) { 3795 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); 3796 if (error) 3797 return error; 3798 if (i == be32_to_cpu(agf->agf_fllast)) 3799 break; 3800 if (++i == xfs_agfl_size(mp)) 3801 i = 0; 3802 } 3803 3804 return 0; 3805 } 3806 3807 int __init 3808 xfs_extfree_intent_init_cache(void) 3809 { 3810 xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent", 3811 sizeof(struct xfs_extent_free_item), 3812 0, 0, NULL); 3813 3814 return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM; 3815 } 3816 3817 void 3818 xfs_extfree_intent_destroy_cache(void) 3819 { 3820 kmem_cache_destroy(xfs_extfree_item_cache); 3821 xfs_extfree_item_cache = NULL; 3822 } 3823
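/*
 * Purely illustrative sketch of how the AGFL walker above might be used; the
 * function names here are invented and the block is not built. The callback
 * signature matches what xfs_agfl_walk() hands to walk_fn: the mount, one
 * AGFL block number per occupied slot, and the opaque priv cookie.
 */
#if 0
static int
xfs_agfl_count_helper(
	struct xfs_mount	*mp,
	xfs_agblock_t		agbno,
	void			*priv)
{
	unsigned int		*count = priv;

	(*count)++;
	return 0;
}

static int
xfs_agfl_count_blocks(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf,
	struct xfs_buf		*agflbp,
	unsigned int		*count)
{
	*count = 0;
	return xfs_agfl_walk(mp, agf, agflbp, xfs_agfl_count_helper, count);
}
#endif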