1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc. 4 * All Rights Reserved. 5 */ 6 #include "xfs.h" 7 #include "xfs_fs.h" 8 #include "xfs_format.h" 9 #include "xfs_log_format.h" 10 #include "xfs_shared.h" 11 #include "xfs_trans_resv.h" 12 #include "xfs_bit.h" 13 #include "xfs_mount.h" 14 #include "xfs_defer.h" 15 #include "xfs_btree.h" 16 #include "xfs_rmap.h" 17 #include "xfs_alloc_btree.h" 18 #include "xfs_alloc.h" 19 #include "xfs_extent_busy.h" 20 #include "xfs_errortag.h" 21 #include "xfs_error.h" 22 #include "xfs_trace.h" 23 #include "xfs_trans.h" 24 #include "xfs_buf_item.h" 25 #include "xfs_log.h" 26 #include "xfs_ag.h" 27 #include "xfs_ag_resv.h" 28 #include "xfs_bmap.h" 29 #include "xfs_health.h" 30 31 struct kmem_cache *xfs_extfree_item_cache; 32 33 struct workqueue_struct *xfs_alloc_wq; 34 35 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) 36 37 #define XFSA_FIXUP_BNO_OK 1 38 #define XFSA_FIXUP_CNT_OK 2 39 40 /* 41 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in 42 * the beginning of the block for a proper header with the location information 43 * and CRC. 44 */ 45 unsigned int 46 xfs_agfl_size( 47 struct xfs_mount *mp) 48 { 49 unsigned int size = mp->m_sb.sb_sectsize; 50 51 if (xfs_has_crc(mp)) 52 size -= sizeof(struct xfs_agfl); 53 54 return size / sizeof(xfs_agblock_t); 55 } 56 57 unsigned int 58 xfs_refc_block( 59 struct xfs_mount *mp) 60 { 61 if (xfs_has_rmapbt(mp)) 62 return XFS_RMAP_BLOCK(mp) + 1; 63 if (xfs_has_finobt(mp)) 64 return XFS_FIBT_BLOCK(mp) + 1; 65 return XFS_IBT_BLOCK(mp) + 1; 66 } 67 68 xfs_extlen_t 69 xfs_prealloc_blocks( 70 struct xfs_mount *mp) 71 { 72 if (xfs_has_reflink(mp)) 73 return xfs_refc_block(mp) + 1; 74 if (xfs_has_rmapbt(mp)) 75 return XFS_RMAP_BLOCK(mp) + 1; 76 if (xfs_has_finobt(mp)) 77 return XFS_FIBT_BLOCK(mp) + 1; 78 return XFS_IBT_BLOCK(mp) + 1; 79 } 80 81 /* 82 * The number of blocks per AG that we withhold from xfs_dec_fdblocks to 83 * guarantee that we can refill the AGFL prior to allocating space in a nearly 84 * full AG. Although the space described by the free space btrees, the 85 * blocks used by the freesp btrees themselves, and the blocks owned by the 86 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk 87 * free space in the AG drop so low that the free space btrees cannot refill an 88 * empty AGFL up to the minimum level. Rather than grind through empty AGs 89 * until the fs goes down, we subtract this many AG blocks from the incore 90 * fdblocks to ensure user allocation does not overcommit the space the 91 * filesystem needs for the AGFLs. The rmap btree uses a per-AG reservation to 92 * withhold space from xfs_dec_fdblocks, so we do not account for that here. 93 */ 94 #define XFS_ALLOCBT_AGFL_RESERVE 4 95 96 /* 97 * Compute the number of blocks that we set aside to guarantee the ability to 98 * refill the AGFL and handle a full bmap btree split. 99 * 100 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of 101 * AGF buffer (PV 947395), we place constraints on the relationship among 102 * actual allocations for data blocks, freelist blocks, and potential file data 103 * bmap btree blocks. However, these restrictions may result in no actual space 104 * allocated for a delayed extent, for example, a data block in a certain AG is 105 * allocated but there is no additional block for the additional bmap btree 106 * block due to a split of the bmap btree of the file. The result of this may 107 * lead to an infinite loop when the file gets flushed to disk and all delayed 108 * extents need to be actually allocated. To get around this, we explicitly set 109 * aside a few blocks which will not be reserved in delayed allocation. 110 * 111 * For each AG, we need to reserve enough blocks to replenish a totally empty 112 * AGFL and 4 more to handle a potential split of the file's bmap btree. 113 */ 114 unsigned int 115 xfs_alloc_set_aside( 116 struct xfs_mount *mp) 117 { 118 return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4); 119 } 120 121 /* 122 * When deciding how much space to allocate out of an AG, we limit the 123 * allocation maximum size to the size the AG. However, we cannot use all the 124 * blocks in the AG - some are permanently used by metadata. These 125 * blocks are generally: 126 * - the AG superblock, AGF, AGI and AGFL 127 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally 128 * the AGI free inode and rmap btree root blocks. 129 * - blocks on the AGFL according to xfs_alloc_set_aside() limits 130 * - the rmapbt root block 131 * 132 * The AG headers are sector sized, so the amount of space they take up is 133 * dependent on filesystem geometry. The others are all single blocks. 134 */ 135 unsigned int 136 xfs_alloc_ag_max_usable( 137 struct xfs_mount *mp) 138 { 139 unsigned int blocks; 140 141 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */ 142 blocks += XFS_ALLOCBT_AGFL_RESERVE; 143 blocks += 3; /* AGF, AGI btree root blocks */ 144 if (xfs_has_finobt(mp)) 145 blocks++; /* finobt root block */ 146 if (xfs_has_rmapbt(mp)) 147 blocks++; /* rmap root block */ 148 if (xfs_has_reflink(mp)) 149 blocks++; /* refcount root block */ 150 151 return mp->m_sb.sb_agblocks - blocks; 152 } 153 154 155 static int 156 xfs_alloc_lookup( 157 struct xfs_btree_cur *cur, 158 xfs_lookup_t dir, 159 xfs_agblock_t bno, 160 xfs_extlen_t len, 161 int *stat) 162 { 163 int error; 164 165 cur->bc_rec.a.ar_startblock = bno; 166 cur->bc_rec.a.ar_blockcount = len; 167 error = xfs_btree_lookup(cur, dir, stat); 168 if (*stat == 1) 169 cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; 170 else 171 cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; 172 return error; 173 } 174 175 /* 176 * Lookup the record equal to [bno, len] in the btree given by cur. 177 */ 178 static inline int /* error */ 179 xfs_alloc_lookup_eq( 180 struct xfs_btree_cur *cur, /* btree cursor */ 181 xfs_agblock_t bno, /* starting block of extent */ 182 xfs_extlen_t len, /* length of extent */ 183 int *stat) /* success/failure */ 184 { 185 return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat); 186 } 187 188 /* 189 * Lookup the first record greater than or equal to [bno, len] 190 * in the btree given by cur. 191 */ 192 int /* error */ 193 xfs_alloc_lookup_ge( 194 struct xfs_btree_cur *cur, /* btree cursor */ 195 xfs_agblock_t bno, /* starting block of extent */ 196 xfs_extlen_t len, /* length of extent */ 197 int *stat) /* success/failure */ 198 { 199 return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat); 200 } 201 202 /* 203 * Lookup the first record less than or equal to [bno, len] 204 * in the btree given by cur. 205 */ 206 int /* error */ 207 xfs_alloc_lookup_le( 208 struct xfs_btree_cur *cur, /* btree cursor */ 209 xfs_agblock_t bno, /* starting block of extent */ 210 xfs_extlen_t len, /* length of extent */ 211 int *stat) /* success/failure */ 212 { 213 return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat); 214 } 215 216 static inline bool 217 xfs_alloc_cur_active( 218 struct xfs_btree_cur *cur) 219 { 220 return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE); 221 } 222 223 /* 224 * Update the record referred to by cur to the value given 225 * by [bno, len]. 226 * This either works (return 0) or gets an EFSCORRUPTED error. 227 */ 228 STATIC int /* error */ 229 xfs_alloc_update( 230 struct xfs_btree_cur *cur, /* btree cursor */ 231 xfs_agblock_t bno, /* starting block of extent */ 232 xfs_extlen_t len) /* length of extent */ 233 { 234 union xfs_btree_rec rec; 235 236 rec.alloc.ar_startblock = cpu_to_be32(bno); 237 rec.alloc.ar_blockcount = cpu_to_be32(len); 238 return xfs_btree_update(cur, &rec); 239 } 240 241 /* Convert the ondisk btree record to its incore representation. */ 242 void 243 xfs_alloc_btrec_to_irec( 244 const union xfs_btree_rec *rec, 245 struct xfs_alloc_rec_incore *irec) 246 { 247 irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock); 248 irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount); 249 } 250 251 /* Simple checks for free space records. */ 252 xfs_failaddr_t 253 xfs_alloc_check_irec( 254 struct xfs_perag *pag, 255 const struct xfs_alloc_rec_incore *irec) 256 { 257 if (irec->ar_blockcount == 0) 258 return __this_address; 259 260 /* check for valid extent range, including overflow */ 261 if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount)) 262 return __this_address; 263 264 return NULL; 265 } 266 267 static inline int 268 xfs_alloc_complain_bad_rec( 269 struct xfs_btree_cur *cur, 270 xfs_failaddr_t fa, 271 const struct xfs_alloc_rec_incore *irec) 272 { 273 struct xfs_mount *mp = cur->bc_mp; 274 275 xfs_warn(mp, 276 "%sbt record corruption in AG %d detected at %pS!", 277 cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa); 278 xfs_warn(mp, 279 "start block 0x%x block count 0x%x", irec->ar_startblock, 280 irec->ar_blockcount); 281 xfs_btree_mark_sick(cur); 282 return -EFSCORRUPTED; 283 } 284 285 /* 286 * Get the data from the pointed-to record. 287 */ 288 int /* error */ 289 xfs_alloc_get_rec( 290 struct xfs_btree_cur *cur, /* btree cursor */ 291 xfs_agblock_t *bno, /* output: starting block of extent */ 292 xfs_extlen_t *len, /* output: length of extent */ 293 int *stat) /* output: success/failure */ 294 { 295 struct xfs_alloc_rec_incore irec; 296 union xfs_btree_rec *rec; 297 xfs_failaddr_t fa; 298 int error; 299 300 error = xfs_btree_get_rec(cur, &rec, stat); 301 if (error || !(*stat)) 302 return error; 303 304 xfs_alloc_btrec_to_irec(rec, &irec); 305 fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); 306 if (fa) 307 return xfs_alloc_complain_bad_rec(cur, fa, &irec); 308 309 *bno = irec.ar_startblock; 310 *len = irec.ar_blockcount; 311 return 0; 312 } 313 314 /* 315 * Compute aligned version of the found extent. 316 * Takes alignment and min length into account. 317 */ 318 STATIC bool 319 xfs_alloc_compute_aligned( 320 xfs_alloc_arg_t *args, /* allocation argument structure */ 321 xfs_agblock_t foundbno, /* starting block in found extent */ 322 xfs_extlen_t foundlen, /* length in found extent */ 323 xfs_agblock_t *resbno, /* result block number */ 324 xfs_extlen_t *reslen, /* result length */ 325 unsigned *busy_gen) 326 { 327 xfs_agblock_t bno = foundbno; 328 xfs_extlen_t len = foundlen; 329 xfs_extlen_t diff; 330 bool busy; 331 332 /* Trim busy sections out of found extent */ 333 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen); 334 335 /* 336 * If we have a largish extent that happens to start before min_agbno, 337 * see if we can shift it into range... 338 */ 339 if (bno < args->min_agbno && bno + len > args->min_agbno) { 340 diff = args->min_agbno - bno; 341 if (len > diff) { 342 bno += diff; 343 len -= diff; 344 } 345 } 346 347 if (args->alignment > 1 && len >= args->minlen) { 348 xfs_agblock_t aligned_bno = roundup(bno, args->alignment); 349 350 diff = aligned_bno - bno; 351 352 *resbno = aligned_bno; 353 *reslen = diff >= len ? 0 : len - diff; 354 } else { 355 *resbno = bno; 356 *reslen = len; 357 } 358 359 return busy; 360 } 361 362 /* 363 * Compute best start block and diff for "near" allocations. 364 * freelen >= wantlen already checked by caller. 365 */ 366 STATIC xfs_extlen_t /* difference value (absolute) */ 367 xfs_alloc_compute_diff( 368 xfs_agblock_t wantbno, /* target starting block */ 369 xfs_extlen_t wantlen, /* target length */ 370 xfs_extlen_t alignment, /* target alignment */ 371 int datatype, /* are we allocating data? */ 372 xfs_agblock_t freebno, /* freespace's starting block */ 373 xfs_extlen_t freelen, /* freespace's length */ 374 xfs_agblock_t *newbnop) /* result: best start block from free */ 375 { 376 xfs_agblock_t freeend; /* end of freespace extent */ 377 xfs_agblock_t newbno1; /* return block number */ 378 xfs_agblock_t newbno2; /* other new block number */ 379 xfs_extlen_t newlen1=0; /* length with newbno1 */ 380 xfs_extlen_t newlen2=0; /* length with newbno2 */ 381 xfs_agblock_t wantend; /* end of target extent */ 382 bool userdata = datatype & XFS_ALLOC_USERDATA; 383 384 ASSERT(freelen >= wantlen); 385 freeend = freebno + freelen; 386 wantend = wantbno + wantlen; 387 /* 388 * We want to allocate from the start of a free extent if it is past 389 * the desired block or if we are allocating user data and the free 390 * extent is before desired block. The second case is there to allow 391 * for contiguous allocation from the remaining free space if the file 392 * grows in the short term. 393 */ 394 if (freebno >= wantbno || (userdata && freeend < wantend)) { 395 if ((newbno1 = roundup(freebno, alignment)) >= freeend) 396 newbno1 = NULLAGBLOCK; 397 } else if (freeend >= wantend && alignment > 1) { 398 newbno1 = roundup(wantbno, alignment); 399 newbno2 = newbno1 - alignment; 400 if (newbno1 >= freeend) 401 newbno1 = NULLAGBLOCK; 402 else 403 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1); 404 if (newbno2 < freebno) 405 newbno2 = NULLAGBLOCK; 406 else 407 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2); 408 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { 409 if (newlen1 < newlen2 || 410 (newlen1 == newlen2 && 411 XFS_ABSDIFF(newbno1, wantbno) > 412 XFS_ABSDIFF(newbno2, wantbno))) 413 newbno1 = newbno2; 414 } else if (newbno2 != NULLAGBLOCK) 415 newbno1 = newbno2; 416 } else if (freeend >= wantend) { 417 newbno1 = wantbno; 418 } else if (alignment > 1) { 419 newbno1 = roundup(freeend - wantlen, alignment); 420 if (newbno1 > freeend - wantlen && 421 newbno1 - alignment >= freebno) 422 newbno1 -= alignment; 423 else if (newbno1 >= freeend) 424 newbno1 = NULLAGBLOCK; 425 } else 426 newbno1 = freeend - wantlen; 427 *newbnop = newbno1; 428 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); 429 } 430 431 /* 432 * Fix up the length, based on mod and prod. 433 * len should be k * prod + mod for some k. 434 * If len is too small it is returned unchanged. 435 * If len hits maxlen it is left alone. 436 */ 437 STATIC void 438 xfs_alloc_fix_len( 439 xfs_alloc_arg_t *args) /* allocation argument structure */ 440 { 441 xfs_extlen_t k; 442 xfs_extlen_t rlen; 443 444 ASSERT(args->mod < args->prod); 445 rlen = args->len; 446 ASSERT(rlen >= args->minlen); 447 ASSERT(rlen <= args->maxlen); 448 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen || 449 (args->mod == 0 && rlen < args->prod)) 450 return; 451 k = rlen % args->prod; 452 if (k == args->mod) 453 return; 454 if (k > args->mod) 455 rlen = rlen - (k - args->mod); 456 else 457 rlen = rlen - args->prod + (args->mod - k); 458 /* casts to (int) catch length underflows */ 459 if ((int)rlen < (int)args->minlen) 460 return; 461 ASSERT(rlen >= args->minlen && rlen <= args->maxlen); 462 ASSERT(rlen % args->prod == args->mod); 463 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >= 464 rlen + args->minleft); 465 args->len = rlen; 466 } 467 468 /* 469 * Update the two btrees, logically removing from freespace the extent 470 * starting at rbno, rlen blocks. The extent is contained within the 471 * actual (current) free extent fbno for flen blocks. 472 * Flags are passed in indicating whether the cursors are set to the 473 * relevant records. 474 */ 475 STATIC int /* error code */ 476 xfs_alloc_fixup_trees( 477 struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */ 478 struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */ 479 xfs_agblock_t fbno, /* starting block of free extent */ 480 xfs_extlen_t flen, /* length of free extent */ 481 xfs_agblock_t rbno, /* starting block of returned extent */ 482 xfs_extlen_t rlen, /* length of returned extent */ 483 int flags) /* flags, XFSA_FIXUP_... */ 484 { 485 int error; /* error code */ 486 int i; /* operation results */ 487 xfs_agblock_t nfbno1; /* first new free startblock */ 488 xfs_agblock_t nfbno2; /* second new free startblock */ 489 xfs_extlen_t nflen1=0; /* first new free length */ 490 xfs_extlen_t nflen2=0; /* second new free length */ 491 struct xfs_mount *mp; 492 493 mp = cnt_cur->bc_mp; 494 495 /* 496 * Look up the record in the by-size tree if necessary. 497 */ 498 if (flags & XFSA_FIXUP_CNT_OK) { 499 #ifdef DEBUG 500 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i))) 501 return error; 502 if (XFS_IS_CORRUPT(mp, 503 i != 1 || 504 nfbno1 != fbno || 505 nflen1 != flen)) { 506 xfs_btree_mark_sick(cnt_cur); 507 return -EFSCORRUPTED; 508 } 509 #endif 510 } else { 511 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i))) 512 return error; 513 if (XFS_IS_CORRUPT(mp, i != 1)) { 514 xfs_btree_mark_sick(cnt_cur); 515 return -EFSCORRUPTED; 516 } 517 } 518 /* 519 * Look up the record in the by-block tree if necessary. 520 */ 521 if (flags & XFSA_FIXUP_BNO_OK) { 522 #ifdef DEBUG 523 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i))) 524 return error; 525 if (XFS_IS_CORRUPT(mp, 526 i != 1 || 527 nfbno1 != fbno || 528 nflen1 != flen)) { 529 xfs_btree_mark_sick(bno_cur); 530 return -EFSCORRUPTED; 531 } 532 #endif 533 } else { 534 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i))) 535 return error; 536 if (XFS_IS_CORRUPT(mp, i != 1)) { 537 xfs_btree_mark_sick(bno_cur); 538 return -EFSCORRUPTED; 539 } 540 } 541 542 #ifdef DEBUG 543 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) { 544 struct xfs_btree_block *bnoblock; 545 struct xfs_btree_block *cntblock; 546 547 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp); 548 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp); 549 550 if (XFS_IS_CORRUPT(mp, 551 bnoblock->bb_numrecs != 552 cntblock->bb_numrecs)) { 553 xfs_btree_mark_sick(bno_cur); 554 return -EFSCORRUPTED; 555 } 556 } 557 #endif 558 559 /* 560 * Deal with all four cases: the allocated record is contained 561 * within the freespace record, so we can have new freespace 562 * at either (or both) end, or no freespace remaining. 563 */ 564 if (rbno == fbno && rlen == flen) 565 nfbno1 = nfbno2 = NULLAGBLOCK; 566 else if (rbno == fbno) { 567 nfbno1 = rbno + rlen; 568 nflen1 = flen - rlen; 569 nfbno2 = NULLAGBLOCK; 570 } else if (rbno + rlen == fbno + flen) { 571 nfbno1 = fbno; 572 nflen1 = flen - rlen; 573 nfbno2 = NULLAGBLOCK; 574 } else { 575 nfbno1 = fbno; 576 nflen1 = rbno - fbno; 577 nfbno2 = rbno + rlen; 578 nflen2 = (fbno + flen) - nfbno2; 579 } 580 /* 581 * Delete the entry from the by-size btree. 582 */ 583 if ((error = xfs_btree_delete(cnt_cur, &i))) 584 return error; 585 if (XFS_IS_CORRUPT(mp, i != 1)) { 586 xfs_btree_mark_sick(cnt_cur); 587 return -EFSCORRUPTED; 588 } 589 /* 590 * Add new by-size btree entry(s). 591 */ 592 if (nfbno1 != NULLAGBLOCK) { 593 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i))) 594 return error; 595 if (XFS_IS_CORRUPT(mp, i != 0)) { 596 xfs_btree_mark_sick(cnt_cur); 597 return -EFSCORRUPTED; 598 } 599 if ((error = xfs_btree_insert(cnt_cur, &i))) 600 return error; 601 if (XFS_IS_CORRUPT(mp, i != 1)) { 602 xfs_btree_mark_sick(cnt_cur); 603 return -EFSCORRUPTED; 604 } 605 } 606 if (nfbno2 != NULLAGBLOCK) { 607 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i))) 608 return error; 609 if (XFS_IS_CORRUPT(mp, i != 0)) { 610 xfs_btree_mark_sick(cnt_cur); 611 return -EFSCORRUPTED; 612 } 613 if ((error = xfs_btree_insert(cnt_cur, &i))) 614 return error; 615 if (XFS_IS_CORRUPT(mp, i != 1)) { 616 xfs_btree_mark_sick(cnt_cur); 617 return -EFSCORRUPTED; 618 } 619 } 620 /* 621 * Fix up the by-block btree entry(s). 622 */ 623 if (nfbno1 == NULLAGBLOCK) { 624 /* 625 * No remaining freespace, just delete the by-block tree entry. 626 */ 627 if ((error = xfs_btree_delete(bno_cur, &i))) 628 return error; 629 if (XFS_IS_CORRUPT(mp, i != 1)) { 630 xfs_btree_mark_sick(bno_cur); 631 return -EFSCORRUPTED; 632 } 633 } else { 634 /* 635 * Update the by-block entry to start later|be shorter. 636 */ 637 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1))) 638 return error; 639 } 640 if (nfbno2 != NULLAGBLOCK) { 641 /* 642 * 2 resulting free entries, need to add one. 643 */ 644 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i))) 645 return error; 646 if (XFS_IS_CORRUPT(mp, i != 0)) { 647 xfs_btree_mark_sick(bno_cur); 648 return -EFSCORRUPTED; 649 } 650 if ((error = xfs_btree_insert(bno_cur, &i))) 651 return error; 652 if (XFS_IS_CORRUPT(mp, i != 1)) { 653 xfs_btree_mark_sick(bno_cur); 654 return -EFSCORRUPTED; 655 } 656 } 657 return 0; 658 } 659 660 /* 661 * We do not verify the AGFL contents against AGF-based index counters here, 662 * even though we may have access to the perag that contains shadow copies. We 663 * don't know if the AGF based counters have been checked, and if they have they 664 * still may be inconsistent because they haven't yet been reset on the first 665 * allocation after the AGF has been read in. 666 * 667 * This means we can only check that all agfl entries contain valid or null 668 * values because we can't reliably determine the active range to exclude 669 * NULLAGBNO as a valid value. 670 * 671 * However, we can't even do that for v4 format filesystems because there are 672 * old versions of mkfs out there that does not initialise the AGFL to known, 673 * verifiable values. HEnce we can't tell the difference between a AGFL block 674 * allocated by mkfs and a corrupted AGFL block here on v4 filesystems. 675 * 676 * As a result, we can only fully validate AGFL block numbers when we pull them 677 * from the freelist in xfs_alloc_get_freelist(). 678 */ 679 static xfs_failaddr_t 680 xfs_agfl_verify( 681 struct xfs_buf *bp) 682 { 683 struct xfs_mount *mp = bp->b_mount; 684 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp); 685 __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp); 686 int i; 687 688 if (!xfs_has_crc(mp)) 689 return NULL; 690 691 if (!xfs_verify_magic(bp, agfl->agfl_magicnum)) 692 return __this_address; 693 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid)) 694 return __this_address; 695 /* 696 * during growfs operations, the perag is not fully initialised, 697 * so we can't use it for any useful checking. growfs ensures we can't 698 * use it by using uncached buffers that don't have the perag attached 699 * so we can detect and avoid this problem. 700 */ 701 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno) 702 return __this_address; 703 704 for (i = 0; i < xfs_agfl_size(mp); i++) { 705 if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK && 706 be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks) 707 return __this_address; 708 } 709 710 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn))) 711 return __this_address; 712 return NULL; 713 } 714 715 static void 716 xfs_agfl_read_verify( 717 struct xfs_buf *bp) 718 { 719 struct xfs_mount *mp = bp->b_mount; 720 xfs_failaddr_t fa; 721 722 /* 723 * There is no verification of non-crc AGFLs because mkfs does not 724 * initialise the AGFL to zero or NULL. Hence the only valid part of the 725 * AGFL is what the AGF says is active. We can't get to the AGF, so we 726 * can't verify just those entries are valid. 727 */ 728 if (!xfs_has_crc(mp)) 729 return; 730 731 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF)) 732 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 733 else { 734 fa = xfs_agfl_verify(bp); 735 if (fa) 736 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 737 } 738 } 739 740 static void 741 xfs_agfl_write_verify( 742 struct xfs_buf *bp) 743 { 744 struct xfs_mount *mp = bp->b_mount; 745 struct xfs_buf_log_item *bip = bp->b_log_item; 746 xfs_failaddr_t fa; 747 748 /* no verification of non-crc AGFLs */ 749 if (!xfs_has_crc(mp)) 750 return; 751 752 fa = xfs_agfl_verify(bp); 753 if (fa) { 754 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 755 return; 756 } 757 758 if (bip) 759 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn); 760 761 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF); 762 } 763 764 const struct xfs_buf_ops xfs_agfl_buf_ops = { 765 .name = "xfs_agfl", 766 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) }, 767 .verify_read = xfs_agfl_read_verify, 768 .verify_write = xfs_agfl_write_verify, 769 .verify_struct = xfs_agfl_verify, 770 }; 771 772 /* 773 * Read in the allocation group free block array. 774 */ 775 int 776 xfs_alloc_read_agfl( 777 struct xfs_perag *pag, 778 struct xfs_trans *tp, 779 struct xfs_buf **bpp) 780 { 781 struct xfs_mount *mp = pag->pag_mount; 782 struct xfs_buf *bp; 783 int error; 784 785 error = xfs_trans_read_buf( 786 mp, tp, mp->m_ddev_targp, 787 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)), 788 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops); 789 if (xfs_metadata_is_sick(error)) 790 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL); 791 if (error) 792 return error; 793 xfs_buf_set_ref(bp, XFS_AGFL_REF); 794 *bpp = bp; 795 return 0; 796 } 797 798 STATIC int 799 xfs_alloc_update_counters( 800 struct xfs_trans *tp, 801 struct xfs_buf *agbp, 802 long len) 803 { 804 struct xfs_agf *agf = agbp->b_addr; 805 806 agbp->b_pag->pagf_freeblks += len; 807 be32_add_cpu(&agf->agf_freeblks, len); 808 809 if (unlikely(be32_to_cpu(agf->agf_freeblks) > 810 be32_to_cpu(agf->agf_length))) { 811 xfs_buf_mark_corrupt(agbp); 812 xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF); 813 return -EFSCORRUPTED; 814 } 815 816 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS); 817 return 0; 818 } 819 820 /* 821 * Block allocation algorithm and data structures. 822 */ 823 struct xfs_alloc_cur { 824 struct xfs_btree_cur *cnt; /* btree cursors */ 825 struct xfs_btree_cur *bnolt; 826 struct xfs_btree_cur *bnogt; 827 xfs_extlen_t cur_len;/* current search length */ 828 xfs_agblock_t rec_bno;/* extent startblock */ 829 xfs_extlen_t rec_len;/* extent length */ 830 xfs_agblock_t bno; /* alloc bno */ 831 xfs_extlen_t len; /* alloc len */ 832 xfs_extlen_t diff; /* diff from search bno */ 833 unsigned int busy_gen;/* busy state */ 834 bool busy; 835 }; 836 837 /* 838 * Set up cursors, etc. in the extent allocation cursor. This function can be 839 * called multiple times to reset an initialized structure without having to 840 * reallocate cursors. 841 */ 842 static int 843 xfs_alloc_cur_setup( 844 struct xfs_alloc_arg *args, 845 struct xfs_alloc_cur *acur) 846 { 847 int error; 848 int i; 849 850 acur->cur_len = args->maxlen; 851 acur->rec_bno = 0; 852 acur->rec_len = 0; 853 acur->bno = 0; 854 acur->len = 0; 855 acur->diff = -1; 856 acur->busy = false; 857 acur->busy_gen = 0; 858 859 /* 860 * Perform an initial cntbt lookup to check for availability of maxlen 861 * extents. If this fails, we'll return -ENOSPC to signal the caller to 862 * attempt a small allocation. 863 */ 864 if (!acur->cnt) 865 acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp, 866 args->agbp, args->pag); 867 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i); 868 if (error) 869 return error; 870 871 /* 872 * Allocate the bnobt left and right search cursors. 873 */ 874 if (!acur->bnolt) 875 acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp, 876 args->agbp, args->pag); 877 if (!acur->bnogt) 878 acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp, 879 args->agbp, args->pag); 880 return i == 1 ? 0 : -ENOSPC; 881 } 882 883 static void 884 xfs_alloc_cur_close( 885 struct xfs_alloc_cur *acur, 886 bool error) 887 { 888 int cur_error = XFS_BTREE_NOERROR; 889 890 if (error) 891 cur_error = XFS_BTREE_ERROR; 892 893 if (acur->cnt) 894 xfs_btree_del_cursor(acur->cnt, cur_error); 895 if (acur->bnolt) 896 xfs_btree_del_cursor(acur->bnolt, cur_error); 897 if (acur->bnogt) 898 xfs_btree_del_cursor(acur->bnogt, cur_error); 899 acur->cnt = acur->bnolt = acur->bnogt = NULL; 900 } 901 902 /* 903 * Check an extent for allocation and track the best available candidate in the 904 * allocation structure. The cursor is deactivated if it has entered an out of 905 * range state based on allocation arguments. Optionally return the extent 906 * extent geometry and allocation status if requested by the caller. 907 */ 908 static int 909 xfs_alloc_cur_check( 910 struct xfs_alloc_arg *args, 911 struct xfs_alloc_cur *acur, 912 struct xfs_btree_cur *cur, 913 int *new) 914 { 915 int error, i; 916 xfs_agblock_t bno, bnoa, bnew; 917 xfs_extlen_t len, lena, diff = -1; 918 bool busy; 919 unsigned busy_gen = 0; 920 bool deactivate = false; 921 bool isbnobt = xfs_btree_is_bno(cur->bc_ops); 922 923 *new = 0; 924 925 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 926 if (error) 927 return error; 928 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 929 xfs_btree_mark_sick(cur); 930 return -EFSCORRUPTED; 931 } 932 933 /* 934 * Check minlen and deactivate a cntbt cursor if out of acceptable size 935 * range (i.e., walking backwards looking for a minlen extent). 936 */ 937 if (len < args->minlen) { 938 deactivate = !isbnobt; 939 goto out; 940 } 941 942 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena, 943 &busy_gen); 944 acur->busy |= busy; 945 if (busy) 946 acur->busy_gen = busy_gen; 947 /* deactivate a bnobt cursor outside of locality range */ 948 if (bnoa < args->min_agbno || bnoa > args->max_agbno) { 949 deactivate = isbnobt; 950 goto out; 951 } 952 if (lena < args->minlen) 953 goto out; 954 955 args->len = XFS_EXTLEN_MIN(lena, args->maxlen); 956 xfs_alloc_fix_len(args); 957 ASSERT(args->len >= args->minlen); 958 if (args->len < acur->len) 959 goto out; 960 961 /* 962 * We have an aligned record that satisfies minlen and beats or matches 963 * the candidate extent size. Compare locality for near allocation mode. 964 */ 965 diff = xfs_alloc_compute_diff(args->agbno, args->len, 966 args->alignment, args->datatype, 967 bnoa, lena, &bnew); 968 if (bnew == NULLAGBLOCK) 969 goto out; 970 971 /* 972 * Deactivate a bnobt cursor with worse locality than the current best. 973 */ 974 if (diff > acur->diff) { 975 deactivate = isbnobt; 976 goto out; 977 } 978 979 ASSERT(args->len > acur->len || 980 (args->len == acur->len && diff <= acur->diff)); 981 acur->rec_bno = bno; 982 acur->rec_len = len; 983 acur->bno = bnew; 984 acur->len = args->len; 985 acur->diff = diff; 986 *new = 1; 987 988 /* 989 * We're done if we found a perfect allocation. This only deactivates 990 * the current cursor, but this is just an optimization to terminate a 991 * cntbt search that otherwise runs to the edge of the tree. 992 */ 993 if (acur->diff == 0 && acur->len == args->maxlen) 994 deactivate = true; 995 out: 996 if (deactivate) 997 cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; 998 trace_xfs_alloc_cur_check(cur, bno, len, diff, *new); 999 return 0; 1000 } 1001 1002 /* 1003 * Complete an allocation of a candidate extent. Remove the extent from both 1004 * trees and update the args structure. 1005 */ 1006 STATIC int 1007 xfs_alloc_cur_finish( 1008 struct xfs_alloc_arg *args, 1009 struct xfs_alloc_cur *acur) 1010 { 1011 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; 1012 int error; 1013 1014 ASSERT(acur->cnt && acur->bnolt); 1015 ASSERT(acur->bno >= acur->rec_bno); 1016 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len); 1017 ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length)); 1018 1019 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno, 1020 acur->rec_len, acur->bno, acur->len, 0); 1021 if (error) 1022 return error; 1023 1024 args->agbno = acur->bno; 1025 args->len = acur->len; 1026 args->wasfromfl = 0; 1027 1028 trace_xfs_alloc_cur(args); 1029 return 0; 1030 } 1031 1032 /* 1033 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses 1034 * bno optimized lookup to search for extents with ideal size and locality. 1035 */ 1036 STATIC int 1037 xfs_alloc_cntbt_iter( 1038 struct xfs_alloc_arg *args, 1039 struct xfs_alloc_cur *acur) 1040 { 1041 struct xfs_btree_cur *cur = acur->cnt; 1042 xfs_agblock_t bno; 1043 xfs_extlen_t len, cur_len; 1044 int error; 1045 int i; 1046 1047 if (!xfs_alloc_cur_active(cur)) 1048 return 0; 1049 1050 /* locality optimized lookup */ 1051 cur_len = acur->cur_len; 1052 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i); 1053 if (error) 1054 return error; 1055 if (i == 0) 1056 return 0; 1057 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 1058 if (error) 1059 return error; 1060 1061 /* check the current record and update search length from it */ 1062 error = xfs_alloc_cur_check(args, acur, cur, &i); 1063 if (error) 1064 return error; 1065 ASSERT(len >= acur->cur_len); 1066 acur->cur_len = len; 1067 1068 /* 1069 * We looked up the first record >= [agbno, len] above. The agbno is a 1070 * secondary key and so the current record may lie just before or after 1071 * agbno. If it is past agbno, check the previous record too so long as 1072 * the length matches as it may be closer. Don't check a smaller record 1073 * because that could deactivate our cursor. 1074 */ 1075 if (bno > args->agbno) { 1076 error = xfs_btree_decrement(cur, 0, &i); 1077 if (!error && i) { 1078 error = xfs_alloc_get_rec(cur, &bno, &len, &i); 1079 if (!error && i && len == acur->cur_len) 1080 error = xfs_alloc_cur_check(args, acur, cur, 1081 &i); 1082 } 1083 if (error) 1084 return error; 1085 } 1086 1087 /* 1088 * Increment the search key until we find at least one allocation 1089 * candidate or if the extent we found was larger. Otherwise, double the 1090 * search key to optimize the search. Efficiency is more important here 1091 * than absolute best locality. 1092 */ 1093 cur_len <<= 1; 1094 if (!acur->len || acur->cur_len >= cur_len) 1095 acur->cur_len++; 1096 else 1097 acur->cur_len = cur_len; 1098 1099 return error; 1100 } 1101 1102 /* 1103 * Deal with the case where only small freespaces remain. Either return the 1104 * contents of the last freespace record, or allocate space from the freelist if 1105 * there is nothing in the tree. 1106 */ 1107 STATIC int /* error */ 1108 xfs_alloc_ag_vextent_small( 1109 struct xfs_alloc_arg *args, /* allocation argument structure */ 1110 struct xfs_btree_cur *ccur, /* optional by-size cursor */ 1111 xfs_agblock_t *fbnop, /* result block number */ 1112 xfs_extlen_t *flenp, /* result length */ 1113 int *stat) /* status: 0-freelist, 1-normal/none */ 1114 { 1115 struct xfs_agf *agf = args->agbp->b_addr; 1116 int error = 0; 1117 xfs_agblock_t fbno = NULLAGBLOCK; 1118 xfs_extlen_t flen = 0; 1119 int i = 0; 1120 1121 /* 1122 * If a cntbt cursor is provided, try to allocate the largest record in 1123 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the 1124 * allocation. Make sure to respect minleft even when pulling from the 1125 * freelist. 1126 */ 1127 if (ccur) 1128 error = xfs_btree_decrement(ccur, 0, &i); 1129 if (error) 1130 goto error; 1131 if (i) { 1132 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i); 1133 if (error) 1134 goto error; 1135 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1136 xfs_btree_mark_sick(ccur); 1137 error = -EFSCORRUPTED; 1138 goto error; 1139 } 1140 goto out; 1141 } 1142 1143 if (args->minlen != 1 || args->alignment != 1 || 1144 args->resv == XFS_AG_RESV_AGFL || 1145 be32_to_cpu(agf->agf_flcount) <= args->minleft) 1146 goto out; 1147 1148 error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp, 1149 &fbno, 0); 1150 if (error) 1151 goto error; 1152 if (fbno == NULLAGBLOCK) 1153 goto out; 1154 1155 xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1, 1156 (args->datatype & XFS_ALLOC_NOBUSY)); 1157 1158 if (args->datatype & XFS_ALLOC_USERDATA) { 1159 struct xfs_buf *bp; 1160 1161 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp, 1162 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno), 1163 args->mp->m_bsize, 0, &bp); 1164 if (error) 1165 goto error; 1166 xfs_trans_binval(args->tp, bp); 1167 } 1168 *fbnop = args->agbno = fbno; 1169 *flenp = args->len = 1; 1170 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) { 1171 xfs_btree_mark_sick(ccur); 1172 error = -EFSCORRUPTED; 1173 goto error; 1174 } 1175 args->wasfromfl = 1; 1176 trace_xfs_alloc_small_freelist(args); 1177 1178 /* 1179 * If we're feeding an AGFL block to something that doesn't live in the 1180 * free space, we need to clear out the OWN_AG rmap. 1181 */ 1182 error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1, 1183 &XFS_RMAP_OINFO_AG); 1184 if (error) 1185 goto error; 1186 1187 *stat = 0; 1188 return 0; 1189 1190 out: 1191 /* 1192 * Can't do the allocation, give up. 1193 */ 1194 if (flen < args->minlen) { 1195 args->agbno = NULLAGBLOCK; 1196 trace_xfs_alloc_small_notenough(args); 1197 flen = 0; 1198 } 1199 *fbnop = fbno; 1200 *flenp = flen; 1201 *stat = 1; 1202 trace_xfs_alloc_small_done(args); 1203 return 0; 1204 1205 error: 1206 trace_xfs_alloc_small_error(args); 1207 return error; 1208 } 1209 1210 /* 1211 * Allocate a variable extent at exactly agno/bno. 1212 * Extent's length (returned in *len) will be between minlen and maxlen, 1213 * and of the form k * prod + mod unless there's nothing that large. 1214 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it. 1215 */ 1216 STATIC int /* error */ 1217 xfs_alloc_ag_vextent_exact( 1218 xfs_alloc_arg_t *args) /* allocation argument structure */ 1219 { 1220 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr; 1221 struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */ 1222 struct xfs_btree_cur *cnt_cur;/* by count btree cursor */ 1223 int error; 1224 xfs_agblock_t fbno; /* start block of found extent */ 1225 xfs_extlen_t flen; /* length of found extent */ 1226 xfs_agblock_t tbno; /* start block of busy extent */ 1227 xfs_extlen_t tlen; /* length of busy extent */ 1228 xfs_agblock_t tend; /* end block of busy extent */ 1229 int i; /* success/failure of operation */ 1230 unsigned busy_gen; 1231 1232 ASSERT(args->alignment == 1); 1233 1234 /* 1235 * Allocate/initialize a cursor for the by-number freespace btree. 1236 */ 1237 bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, 1238 args->pag); 1239 1240 /* 1241 * Lookup bno and minlen in the btree (minlen is irrelevant, really). 1242 * Look for the closest free block <= bno, it must contain bno 1243 * if any free block does. 1244 */ 1245 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i); 1246 if (error) 1247 goto error0; 1248 if (!i) 1249 goto not_found; 1250 1251 /* 1252 * Grab the freespace record. 1253 */ 1254 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i); 1255 if (error) 1256 goto error0; 1257 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1258 xfs_btree_mark_sick(bno_cur); 1259 error = -EFSCORRUPTED; 1260 goto error0; 1261 } 1262 ASSERT(fbno <= args->agbno); 1263 1264 /* 1265 * Check for overlapping busy extents. 1266 */ 1267 tbno = fbno; 1268 tlen = flen; 1269 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen); 1270 1271 /* 1272 * Give up if the start of the extent is busy, or the freespace isn't 1273 * long enough for the minimum request. 1274 */ 1275 if (tbno > args->agbno) 1276 goto not_found; 1277 if (tlen < args->minlen) 1278 goto not_found; 1279 tend = tbno + tlen; 1280 if (tend < args->agbno + args->minlen) 1281 goto not_found; 1282 1283 /* 1284 * End of extent will be smaller of the freespace end and the 1285 * maximal requested end. 1286 * 1287 * Fix the length according to mod and prod if given. 1288 */ 1289 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen) 1290 - args->agbno; 1291 xfs_alloc_fix_len(args); 1292 ASSERT(args->agbno + args->len <= tend); 1293 1294 /* 1295 * We are allocating agbno for args->len 1296 * Allocate/initialize a cursor for the by-size btree. 1297 */ 1298 cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, 1299 args->pag); 1300 ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length)); 1301 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno, 1302 args->len, XFSA_FIXUP_BNO_OK); 1303 if (error) { 1304 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1305 goto error0; 1306 } 1307 1308 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1309 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1310 1311 args->wasfromfl = 0; 1312 trace_xfs_alloc_exact_done(args); 1313 return 0; 1314 1315 not_found: 1316 /* Didn't find it, return null. */ 1317 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1318 args->agbno = NULLAGBLOCK; 1319 trace_xfs_alloc_exact_notfound(args); 1320 return 0; 1321 1322 error0: 1323 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1324 trace_xfs_alloc_exact_error(args); 1325 return error; 1326 } 1327 1328 /* 1329 * Search a given number of btree records in a given direction. Check each 1330 * record against the good extent we've already found. 1331 */ 1332 STATIC int 1333 xfs_alloc_walk_iter( 1334 struct xfs_alloc_arg *args, 1335 struct xfs_alloc_cur *acur, 1336 struct xfs_btree_cur *cur, 1337 bool increment, 1338 bool find_one, /* quit on first candidate */ 1339 int count, /* rec count (-1 for infinite) */ 1340 int *stat) 1341 { 1342 int error; 1343 int i; 1344 1345 *stat = 0; 1346 1347 /* 1348 * Search so long as the cursor is active or we find a better extent. 1349 * The cursor is deactivated if it extends beyond the range of the 1350 * current allocation candidate. 1351 */ 1352 while (xfs_alloc_cur_active(cur) && count) { 1353 error = xfs_alloc_cur_check(args, acur, cur, &i); 1354 if (error) 1355 return error; 1356 if (i == 1) { 1357 *stat = 1; 1358 if (find_one) 1359 break; 1360 } 1361 if (!xfs_alloc_cur_active(cur)) 1362 break; 1363 1364 if (increment) 1365 error = xfs_btree_increment(cur, 0, &i); 1366 else 1367 error = xfs_btree_decrement(cur, 0, &i); 1368 if (error) 1369 return error; 1370 if (i == 0) 1371 cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE; 1372 1373 if (count > 0) 1374 count--; 1375 } 1376 1377 return 0; 1378 } 1379 1380 /* 1381 * Search the by-bno and by-size btrees in parallel in search of an extent with 1382 * ideal locality based on the NEAR mode ->agbno locality hint. 1383 */ 1384 STATIC int 1385 xfs_alloc_ag_vextent_locality( 1386 struct xfs_alloc_arg *args, 1387 struct xfs_alloc_cur *acur, 1388 int *stat) 1389 { 1390 struct xfs_btree_cur *fbcur = NULL; 1391 int error; 1392 int i; 1393 bool fbinc; 1394 1395 ASSERT(acur->len == 0); 1396 1397 *stat = 0; 1398 1399 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i); 1400 if (error) 1401 return error; 1402 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i); 1403 if (error) 1404 return error; 1405 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i); 1406 if (error) 1407 return error; 1408 1409 /* 1410 * Search the bnobt and cntbt in parallel. Search the bnobt left and 1411 * right and lookup the closest extent to the locality hint for each 1412 * extent size key in the cntbt. The entire search terminates 1413 * immediately on a bnobt hit because that means we've found best case 1414 * locality. Otherwise the search continues until the cntbt cursor runs 1415 * off the end of the tree. If no allocation candidate is found at this 1416 * point, give up on locality, walk backwards from the end of the cntbt 1417 * and take the first available extent. 1418 * 1419 * The parallel tree searches balance each other out to provide fairly 1420 * consistent performance for various situations. The bnobt search can 1421 * have pathological behavior in the worst case scenario of larger 1422 * allocation requests and fragmented free space. On the other hand, the 1423 * bnobt is able to satisfy most smaller allocation requests much more 1424 * quickly than the cntbt. The cntbt search can sift through fragmented 1425 * free space and sets of free extents for larger allocation requests 1426 * more quickly than the bnobt. Since the locality hint is just a hint 1427 * and we don't want to scan the entire bnobt for perfect locality, the 1428 * cntbt search essentially bounds the bnobt search such that we can 1429 * find good enough locality at reasonable performance in most cases. 1430 */ 1431 while (xfs_alloc_cur_active(acur->bnolt) || 1432 xfs_alloc_cur_active(acur->bnogt) || 1433 xfs_alloc_cur_active(acur->cnt)) { 1434 1435 trace_xfs_alloc_cur_lookup(args); 1436 1437 /* 1438 * Search the bnobt left and right. In the case of a hit, finish 1439 * the search in the opposite direction and we're done. 1440 */ 1441 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false, 1442 true, 1, &i); 1443 if (error) 1444 return error; 1445 if (i == 1) { 1446 trace_xfs_alloc_cur_left(args); 1447 fbcur = acur->bnogt; 1448 fbinc = true; 1449 break; 1450 } 1451 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true, 1452 1, &i); 1453 if (error) 1454 return error; 1455 if (i == 1) { 1456 trace_xfs_alloc_cur_right(args); 1457 fbcur = acur->bnolt; 1458 fbinc = false; 1459 break; 1460 } 1461 1462 /* 1463 * Check the extent with best locality based on the current 1464 * extent size search key and keep track of the best candidate. 1465 */ 1466 error = xfs_alloc_cntbt_iter(args, acur); 1467 if (error) 1468 return error; 1469 if (!xfs_alloc_cur_active(acur->cnt)) { 1470 trace_xfs_alloc_cur_lookup_done(args); 1471 break; 1472 } 1473 } 1474 1475 /* 1476 * If we failed to find anything due to busy extents, return empty 1477 * handed so the caller can flush and retry. If no busy extents were 1478 * found, walk backwards from the end of the cntbt as a last resort. 1479 */ 1480 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) { 1481 error = xfs_btree_decrement(acur->cnt, 0, &i); 1482 if (error) 1483 return error; 1484 if (i) { 1485 acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE; 1486 fbcur = acur->cnt; 1487 fbinc = false; 1488 } 1489 } 1490 1491 /* 1492 * Search in the opposite direction for a better entry in the case of 1493 * a bnobt hit or walk backwards from the end of the cntbt. 1494 */ 1495 if (fbcur) { 1496 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1, 1497 &i); 1498 if (error) 1499 return error; 1500 } 1501 1502 if (acur->len) 1503 *stat = 1; 1504 1505 return 0; 1506 } 1507 1508 /* Check the last block of the cnt btree for allocations. */ 1509 static int 1510 xfs_alloc_ag_vextent_lastblock( 1511 struct xfs_alloc_arg *args, 1512 struct xfs_alloc_cur *acur, 1513 xfs_agblock_t *bno, 1514 xfs_extlen_t *len, 1515 bool *allocated) 1516 { 1517 int error; 1518 int i; 1519 1520 #ifdef DEBUG 1521 /* Randomly don't execute the first algorithm. */ 1522 if (get_random_u32_below(2)) 1523 return 0; 1524 #endif 1525 1526 /* 1527 * Start from the entry that lookup found, sequence through all larger 1528 * free blocks. If we're actually pointing at a record smaller than 1529 * maxlen, go to the start of this block, and skip all those smaller 1530 * than minlen. 1531 */ 1532 if (*len || args->alignment > 1) { 1533 acur->cnt->bc_levels[0].ptr = 1; 1534 do { 1535 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i); 1536 if (error) 1537 return error; 1538 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1539 xfs_btree_mark_sick(acur->cnt); 1540 return -EFSCORRUPTED; 1541 } 1542 if (*len >= args->minlen) 1543 break; 1544 error = xfs_btree_increment(acur->cnt, 0, &i); 1545 if (error) 1546 return error; 1547 } while (i); 1548 ASSERT(*len >= args->minlen); 1549 if (!i) 1550 return 0; 1551 } 1552 1553 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i); 1554 if (error) 1555 return error; 1556 1557 /* 1558 * It didn't work. We COULD be in a case where there's a good record 1559 * somewhere, so try again. 1560 */ 1561 if (acur->len == 0) 1562 return 0; 1563 1564 trace_xfs_alloc_near_first(args); 1565 *allocated = true; 1566 return 0; 1567 } 1568 1569 /* 1570 * Allocate a variable extent near bno in the allocation group agno. 1571 * Extent's length (returned in len) will be between minlen and maxlen, 1572 * and of the form k * prod + mod unless there's nothing that large. 1573 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1574 */ 1575 STATIC int 1576 xfs_alloc_ag_vextent_near( 1577 struct xfs_alloc_arg *args, 1578 uint32_t alloc_flags) 1579 { 1580 struct xfs_alloc_cur acur = {}; 1581 int error; /* error code */ 1582 int i; /* result code, temporary */ 1583 xfs_agblock_t bno; 1584 xfs_extlen_t len; 1585 1586 /* handle uninitialized agbno range so caller doesn't have to */ 1587 if (!args->min_agbno && !args->max_agbno) 1588 args->max_agbno = args->mp->m_sb.sb_agblocks - 1; 1589 ASSERT(args->min_agbno <= args->max_agbno); 1590 1591 /* clamp agbno to the range if it's outside */ 1592 if (args->agbno < args->min_agbno) 1593 args->agbno = args->min_agbno; 1594 if (args->agbno > args->max_agbno) 1595 args->agbno = args->max_agbno; 1596 1597 /* Retry once quickly if we find busy extents before blocking. */ 1598 alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; 1599 restart: 1600 len = 0; 1601 1602 /* 1603 * Set up cursors and see if there are any free extents as big as 1604 * maxlen. If not, pick the last entry in the tree unless the tree is 1605 * empty. 1606 */ 1607 error = xfs_alloc_cur_setup(args, &acur); 1608 if (error == -ENOSPC) { 1609 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno, 1610 &len, &i); 1611 if (error) 1612 goto out; 1613 if (i == 0 || len == 0) { 1614 trace_xfs_alloc_near_noentry(args); 1615 goto out; 1616 } 1617 ASSERT(i == 1); 1618 } else if (error) { 1619 goto out; 1620 } 1621 1622 /* 1623 * First algorithm. 1624 * If the requested extent is large wrt the freespaces available 1625 * in this a.g., then the cursor will be pointing to a btree entry 1626 * near the right edge of the tree. If it's in the last btree leaf 1627 * block, then we just examine all the entries in that block 1628 * that are big enough, and pick the best one. 1629 */ 1630 if (xfs_btree_islastblock(acur.cnt, 0)) { 1631 bool allocated = false; 1632 1633 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len, 1634 &allocated); 1635 if (error) 1636 goto out; 1637 if (allocated) 1638 goto alloc_finish; 1639 } 1640 1641 /* 1642 * Second algorithm. Combined cntbt and bnobt search to find ideal 1643 * locality. 1644 */ 1645 error = xfs_alloc_ag_vextent_locality(args, &acur, &i); 1646 if (error) 1647 goto out; 1648 1649 /* 1650 * If we couldn't get anything, give up. 1651 */ 1652 if (!acur.len) { 1653 if (acur.busy) { 1654 /* 1655 * Our only valid extents must have been busy. Flush and 1656 * retry the allocation again. If we get an -EAGAIN 1657 * error, we're being told that a deadlock was avoided 1658 * and the current transaction needs committing before 1659 * the allocation can be retried. 1660 */ 1661 trace_xfs_alloc_near_busy(args); 1662 error = xfs_extent_busy_flush(args->tp, args->pag, 1663 acur.busy_gen, alloc_flags); 1664 if (error) 1665 goto out; 1666 1667 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; 1668 goto restart; 1669 } 1670 trace_xfs_alloc_size_neither(args); 1671 args->agbno = NULLAGBLOCK; 1672 goto out; 1673 } 1674 1675 alloc_finish: 1676 /* fix up btrees on a successful allocation */ 1677 error = xfs_alloc_cur_finish(args, &acur); 1678 1679 out: 1680 xfs_alloc_cur_close(&acur, error); 1681 return error; 1682 } 1683 1684 /* 1685 * Allocate a variable extent anywhere in the allocation group agno. 1686 * Extent's length (returned in len) will be between minlen and maxlen, 1687 * and of the form k * prod + mod unless there's nothing that large. 1688 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it. 1689 */ 1690 static int 1691 xfs_alloc_ag_vextent_size( 1692 struct xfs_alloc_arg *args, 1693 uint32_t alloc_flags) 1694 { 1695 struct xfs_agf *agf = args->agbp->b_addr; 1696 struct xfs_btree_cur *bno_cur; 1697 struct xfs_btree_cur *cnt_cur; 1698 xfs_agblock_t fbno; /* start of found freespace */ 1699 xfs_extlen_t flen; /* length of found freespace */ 1700 xfs_agblock_t rbno; /* returned block number */ 1701 xfs_extlen_t rlen; /* length of returned extent */ 1702 bool busy; 1703 unsigned busy_gen; 1704 int error; 1705 int i; 1706 1707 /* Retry once quickly if we find busy extents before blocking. */ 1708 alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH; 1709 restart: 1710 /* 1711 * Allocate and initialize a cursor for the by-size btree. 1712 */ 1713 cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp, 1714 args->pag); 1715 bno_cur = NULL; 1716 1717 /* 1718 * Look for an entry >= maxlen+alignment-1 blocks. 1719 */ 1720 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0, 1721 args->maxlen + args->alignment - 1, &i))) 1722 goto error0; 1723 1724 /* 1725 * If none then we have to settle for a smaller extent. In the case that 1726 * there are no large extents, this will return the last entry in the 1727 * tree unless the tree is empty. In the case that there are only busy 1728 * large extents, this will return the largest small extent unless there 1729 * are no smaller extents available. 1730 */ 1731 if (!i) { 1732 error = xfs_alloc_ag_vextent_small(args, cnt_cur, 1733 &fbno, &flen, &i); 1734 if (error) 1735 goto error0; 1736 if (i == 0 || flen == 0) { 1737 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1738 trace_xfs_alloc_size_noentry(args); 1739 return 0; 1740 } 1741 ASSERT(i == 1); 1742 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno, 1743 &rlen, &busy_gen); 1744 } else { 1745 /* 1746 * Search for a non-busy extent that is large enough. 1747 */ 1748 for (;;) { 1749 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i); 1750 if (error) 1751 goto error0; 1752 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1753 xfs_btree_mark_sick(cnt_cur); 1754 error = -EFSCORRUPTED; 1755 goto error0; 1756 } 1757 1758 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1759 &rbno, &rlen, &busy_gen); 1760 1761 if (rlen >= args->maxlen) 1762 break; 1763 1764 error = xfs_btree_increment(cnt_cur, 0, &i); 1765 if (error) 1766 goto error0; 1767 if (i) 1768 continue; 1769 1770 /* 1771 * Our only valid extents must have been busy. Flush and 1772 * retry the allocation again. If we get an -EAGAIN 1773 * error, we're being told that a deadlock was avoided 1774 * and the current transaction needs committing before 1775 * the allocation can be retried. 1776 */ 1777 trace_xfs_alloc_size_busy(args); 1778 error = xfs_extent_busy_flush(args->tp, args->pag, 1779 busy_gen, alloc_flags); 1780 if (error) 1781 goto error0; 1782 1783 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; 1784 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1785 goto restart; 1786 } 1787 } 1788 1789 /* 1790 * In the first case above, we got the last entry in the 1791 * by-size btree. Now we check to see if the space hits maxlen 1792 * once aligned; if not, we search left for something better. 1793 * This can't happen in the second case above. 1794 */ 1795 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1796 if (XFS_IS_CORRUPT(args->mp, 1797 rlen != 0 && 1798 (rlen > flen || 1799 rbno + rlen > fbno + flen))) { 1800 xfs_btree_mark_sick(cnt_cur); 1801 error = -EFSCORRUPTED; 1802 goto error0; 1803 } 1804 if (rlen < args->maxlen) { 1805 xfs_agblock_t bestfbno; 1806 xfs_extlen_t bestflen; 1807 xfs_agblock_t bestrbno; 1808 xfs_extlen_t bestrlen; 1809 1810 bestrlen = rlen; 1811 bestrbno = rbno; 1812 bestflen = flen; 1813 bestfbno = fbno; 1814 for (;;) { 1815 if ((error = xfs_btree_decrement(cnt_cur, 0, &i))) 1816 goto error0; 1817 if (i == 0) 1818 break; 1819 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, 1820 &i))) 1821 goto error0; 1822 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1823 xfs_btree_mark_sick(cnt_cur); 1824 error = -EFSCORRUPTED; 1825 goto error0; 1826 } 1827 if (flen < bestrlen) 1828 break; 1829 busy = xfs_alloc_compute_aligned(args, fbno, flen, 1830 &rbno, &rlen, &busy_gen); 1831 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen); 1832 if (XFS_IS_CORRUPT(args->mp, 1833 rlen != 0 && 1834 (rlen > flen || 1835 rbno + rlen > fbno + flen))) { 1836 xfs_btree_mark_sick(cnt_cur); 1837 error = -EFSCORRUPTED; 1838 goto error0; 1839 } 1840 if (rlen > bestrlen) { 1841 bestrlen = rlen; 1842 bestrbno = rbno; 1843 bestflen = flen; 1844 bestfbno = fbno; 1845 if (rlen == args->maxlen) 1846 break; 1847 } 1848 } 1849 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen, 1850 &i))) 1851 goto error0; 1852 if (XFS_IS_CORRUPT(args->mp, i != 1)) { 1853 xfs_btree_mark_sick(cnt_cur); 1854 error = -EFSCORRUPTED; 1855 goto error0; 1856 } 1857 rlen = bestrlen; 1858 rbno = bestrbno; 1859 flen = bestflen; 1860 fbno = bestfbno; 1861 } 1862 args->wasfromfl = 0; 1863 /* 1864 * Fix up the length. 1865 */ 1866 args->len = rlen; 1867 if (rlen < args->minlen) { 1868 if (busy) { 1869 /* 1870 * Our only valid extents must have been busy. Flush and 1871 * retry the allocation again. If we get an -EAGAIN 1872 * error, we're being told that a deadlock was avoided 1873 * and the current transaction needs committing before 1874 * the allocation can be retried. 1875 */ 1876 trace_xfs_alloc_size_busy(args); 1877 error = xfs_extent_busy_flush(args->tp, args->pag, 1878 busy_gen, alloc_flags); 1879 if (error) 1880 goto error0; 1881 1882 alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH; 1883 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1884 goto restart; 1885 } 1886 goto out_nominleft; 1887 } 1888 xfs_alloc_fix_len(args); 1889 1890 rlen = args->len; 1891 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) { 1892 xfs_btree_mark_sick(cnt_cur); 1893 error = -EFSCORRUPTED; 1894 goto error0; 1895 } 1896 /* 1897 * Allocate and initialize a cursor for the by-block tree. 1898 */ 1899 bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp, 1900 args->pag); 1901 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, 1902 rbno, rlen, XFSA_FIXUP_CNT_OK))) 1903 goto error0; 1904 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1905 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 1906 cnt_cur = bno_cur = NULL; 1907 args->len = rlen; 1908 args->agbno = rbno; 1909 if (XFS_IS_CORRUPT(args->mp, 1910 args->agbno + args->len > 1911 be32_to_cpu(agf->agf_length))) { 1912 xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT); 1913 error = -EFSCORRUPTED; 1914 goto error0; 1915 } 1916 trace_xfs_alloc_size_done(args); 1917 return 0; 1918 1919 error0: 1920 trace_xfs_alloc_size_error(args); 1921 if (cnt_cur) 1922 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 1923 if (bno_cur) 1924 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 1925 return error; 1926 1927 out_nominleft: 1928 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 1929 trace_xfs_alloc_size_nominleft(args); 1930 args->agbno = NULLAGBLOCK; 1931 return 0; 1932 } 1933 1934 /* 1935 * Free the extent starting at agno/bno for length. 1936 */ 1937 STATIC int 1938 xfs_free_ag_extent( 1939 struct xfs_trans *tp, 1940 struct xfs_buf *agbp, 1941 xfs_agnumber_t agno, 1942 xfs_agblock_t bno, 1943 xfs_extlen_t len, 1944 const struct xfs_owner_info *oinfo, 1945 enum xfs_ag_resv_type type) 1946 { 1947 struct xfs_mount *mp; 1948 struct xfs_btree_cur *bno_cur; 1949 struct xfs_btree_cur *cnt_cur; 1950 xfs_agblock_t gtbno; /* start of right neighbor */ 1951 xfs_extlen_t gtlen; /* length of right neighbor */ 1952 xfs_agblock_t ltbno; /* start of left neighbor */ 1953 xfs_extlen_t ltlen; /* length of left neighbor */ 1954 xfs_agblock_t nbno; /* new starting block of freesp */ 1955 xfs_extlen_t nlen; /* new length of freespace */ 1956 int haveleft; /* have a left neighbor */ 1957 int haveright; /* have a right neighbor */ 1958 int i; 1959 int error; 1960 struct xfs_perag *pag = agbp->b_pag; 1961 1962 bno_cur = cnt_cur = NULL; 1963 mp = tp->t_mountp; 1964 1965 if (!xfs_rmap_should_skip_owner_update(oinfo)) { 1966 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo); 1967 if (error) 1968 goto error0; 1969 } 1970 1971 /* 1972 * Allocate and initialize a cursor for the by-block btree. 1973 */ 1974 bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); 1975 /* 1976 * Look for a neighboring block on the left (lower block numbers) 1977 * that is contiguous with this space. 1978 */ 1979 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft))) 1980 goto error0; 1981 if (haveleft) { 1982 /* 1983 * There is a block to our left. 1984 */ 1985 if ((error = xfs_alloc_get_rec(bno_cur, <bno, <len, &i))) 1986 goto error0; 1987 if (XFS_IS_CORRUPT(mp, i != 1)) { 1988 xfs_btree_mark_sick(bno_cur); 1989 error = -EFSCORRUPTED; 1990 goto error0; 1991 } 1992 /* 1993 * It's not contiguous, though. 1994 */ 1995 if (ltbno + ltlen < bno) 1996 haveleft = 0; 1997 else { 1998 /* 1999 * If this failure happens the request to free this 2000 * space was invalid, it's (partly) already free. 2001 * Very bad. 2002 */ 2003 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) { 2004 xfs_btree_mark_sick(bno_cur); 2005 error = -EFSCORRUPTED; 2006 goto error0; 2007 } 2008 } 2009 } 2010 /* 2011 * Look for a neighboring block on the right (higher block numbers) 2012 * that is contiguous with this space. 2013 */ 2014 if ((error = xfs_btree_increment(bno_cur, 0, &haveright))) 2015 goto error0; 2016 if (haveright) { 2017 /* 2018 * There is a block to our right. 2019 */ 2020 if ((error = xfs_alloc_get_rec(bno_cur, >bno, >len, &i))) 2021 goto error0; 2022 if (XFS_IS_CORRUPT(mp, i != 1)) { 2023 xfs_btree_mark_sick(bno_cur); 2024 error = -EFSCORRUPTED; 2025 goto error0; 2026 } 2027 /* 2028 * It's not contiguous, though. 2029 */ 2030 if (bno + len < gtbno) 2031 haveright = 0; 2032 else { 2033 /* 2034 * If this failure happens the request to free this 2035 * space was invalid, it's (partly) already free. 2036 * Very bad. 2037 */ 2038 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { 2039 xfs_btree_mark_sick(bno_cur); 2040 error = -EFSCORRUPTED; 2041 goto error0; 2042 } 2043 } 2044 } 2045 /* 2046 * Now allocate and initialize a cursor for the by-size tree. 2047 */ 2048 cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); 2049 /* 2050 * Have both left and right contiguous neighbors. 2051 * Merge all three into a single free block. 2052 */ 2053 if (haveleft && haveright) { 2054 /* 2055 * Delete the old by-size entry on the left. 2056 */ 2057 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2058 goto error0; 2059 if (XFS_IS_CORRUPT(mp, i != 1)) { 2060 xfs_btree_mark_sick(cnt_cur); 2061 error = -EFSCORRUPTED; 2062 goto error0; 2063 } 2064 if ((error = xfs_btree_delete(cnt_cur, &i))) 2065 goto error0; 2066 if (XFS_IS_CORRUPT(mp, i != 1)) { 2067 xfs_btree_mark_sick(cnt_cur); 2068 error = -EFSCORRUPTED; 2069 goto error0; 2070 } 2071 /* 2072 * Delete the old by-size entry on the right. 2073 */ 2074 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2075 goto error0; 2076 if (XFS_IS_CORRUPT(mp, i != 1)) { 2077 xfs_btree_mark_sick(cnt_cur); 2078 error = -EFSCORRUPTED; 2079 goto error0; 2080 } 2081 if ((error = xfs_btree_delete(cnt_cur, &i))) 2082 goto error0; 2083 if (XFS_IS_CORRUPT(mp, i != 1)) { 2084 xfs_btree_mark_sick(cnt_cur); 2085 error = -EFSCORRUPTED; 2086 goto error0; 2087 } 2088 /* 2089 * Delete the old by-block entry for the right block. 2090 */ 2091 if ((error = xfs_btree_delete(bno_cur, &i))) 2092 goto error0; 2093 if (XFS_IS_CORRUPT(mp, i != 1)) { 2094 xfs_btree_mark_sick(bno_cur); 2095 error = -EFSCORRUPTED; 2096 goto error0; 2097 } 2098 /* 2099 * Move the by-block cursor back to the left neighbor. 2100 */ 2101 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2102 goto error0; 2103 if (XFS_IS_CORRUPT(mp, i != 1)) { 2104 xfs_btree_mark_sick(bno_cur); 2105 error = -EFSCORRUPTED; 2106 goto error0; 2107 } 2108 #ifdef DEBUG 2109 /* 2110 * Check that this is the right record: delete didn't 2111 * mangle the cursor. 2112 */ 2113 { 2114 xfs_agblock_t xxbno; 2115 xfs_extlen_t xxlen; 2116 2117 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 2118 &i))) 2119 goto error0; 2120 if (XFS_IS_CORRUPT(mp, 2121 i != 1 || 2122 xxbno != ltbno || 2123 xxlen != ltlen)) { 2124 xfs_btree_mark_sick(bno_cur); 2125 error = -EFSCORRUPTED; 2126 goto error0; 2127 } 2128 } 2129 #endif 2130 /* 2131 * Update remaining by-block entry to the new, joined block. 2132 */ 2133 nbno = ltbno; 2134 nlen = len + ltlen + gtlen; 2135 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2136 goto error0; 2137 } 2138 /* 2139 * Have only a left contiguous neighbor. 2140 * Merge it together with the new freespace. 2141 */ 2142 else if (haveleft) { 2143 /* 2144 * Delete the old by-size entry on the left. 2145 */ 2146 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2147 goto error0; 2148 if (XFS_IS_CORRUPT(mp, i != 1)) { 2149 xfs_btree_mark_sick(cnt_cur); 2150 error = -EFSCORRUPTED; 2151 goto error0; 2152 } 2153 if ((error = xfs_btree_delete(cnt_cur, &i))) 2154 goto error0; 2155 if (XFS_IS_CORRUPT(mp, i != 1)) { 2156 xfs_btree_mark_sick(cnt_cur); 2157 error = -EFSCORRUPTED; 2158 goto error0; 2159 } 2160 /* 2161 * Back up the by-block cursor to the left neighbor, and 2162 * update its length. 2163 */ 2164 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2165 goto error0; 2166 if (XFS_IS_CORRUPT(mp, i != 1)) { 2167 xfs_btree_mark_sick(bno_cur); 2168 error = -EFSCORRUPTED; 2169 goto error0; 2170 } 2171 nbno = ltbno; 2172 nlen = len + ltlen; 2173 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2174 goto error0; 2175 } 2176 /* 2177 * Have only a right contiguous neighbor. 2178 * Merge it together with the new freespace. 2179 */ 2180 else if (haveright) { 2181 /* 2182 * Delete the old by-size entry on the right. 2183 */ 2184 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2185 goto error0; 2186 if (XFS_IS_CORRUPT(mp, i != 1)) { 2187 xfs_btree_mark_sick(cnt_cur); 2188 error = -EFSCORRUPTED; 2189 goto error0; 2190 } 2191 if ((error = xfs_btree_delete(cnt_cur, &i))) 2192 goto error0; 2193 if (XFS_IS_CORRUPT(mp, i != 1)) { 2194 xfs_btree_mark_sick(cnt_cur); 2195 error = -EFSCORRUPTED; 2196 goto error0; 2197 } 2198 /* 2199 * Update the starting block and length of the right 2200 * neighbor in the by-block tree. 2201 */ 2202 nbno = bno; 2203 nlen = len + gtlen; 2204 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2205 goto error0; 2206 } 2207 /* 2208 * No contiguous neighbors. 2209 * Insert the new freespace into the by-block tree. 2210 */ 2211 else { 2212 nbno = bno; 2213 nlen = len; 2214 if ((error = xfs_btree_insert(bno_cur, &i))) 2215 goto error0; 2216 if (XFS_IS_CORRUPT(mp, i != 1)) { 2217 xfs_btree_mark_sick(bno_cur); 2218 error = -EFSCORRUPTED; 2219 goto error0; 2220 } 2221 } 2222 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2223 bno_cur = NULL; 2224 /* 2225 * In all cases we need to insert the new freespace in the by-size tree. 2226 */ 2227 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2228 goto error0; 2229 if (XFS_IS_CORRUPT(mp, i != 0)) { 2230 xfs_btree_mark_sick(cnt_cur); 2231 error = -EFSCORRUPTED; 2232 goto error0; 2233 } 2234 if ((error = xfs_btree_insert(cnt_cur, &i))) 2235 goto error0; 2236 if (XFS_IS_CORRUPT(mp, i != 1)) { 2237 xfs_btree_mark_sick(cnt_cur); 2238 error = -EFSCORRUPTED; 2239 goto error0; 2240 } 2241 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2242 cnt_cur = NULL; 2243 2244 /* 2245 * Update the freespace totals in the ag and superblock. 2246 */ 2247 error = xfs_alloc_update_counters(tp, agbp, len); 2248 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); 2249 if (error) 2250 goto error0; 2251 2252 XFS_STATS_INC(mp, xs_freex); 2253 XFS_STATS_ADD(mp, xs_freeb, len); 2254 2255 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); 2256 2257 return 0; 2258 2259 error0: 2260 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); 2261 if (bno_cur) 2262 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2263 if (cnt_cur) 2264 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2265 return error; 2266 } 2267 2268 /* 2269 * Visible (exported) allocation/free functions. 2270 * Some of these are used just by xfs_alloc_btree.c and this file. 2271 */ 2272 2273 /* 2274 * Compute and fill in value of m_alloc_maxlevels. 2275 */ 2276 void 2277 xfs_alloc_compute_maxlevels( 2278 xfs_mount_t *mp) /* file system mount structure */ 2279 { 2280 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2281 (mp->m_sb.sb_agblocks + 1) / 2); 2282 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); 2283 } 2284 2285 /* 2286 * Find the length of the longest extent in an AG. The 'need' parameter 2287 * specifies how much space we're going to need for the AGFL and the 2288 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2289 * other callers. 2290 */ 2291 xfs_extlen_t 2292 xfs_alloc_longest_free_extent( 2293 struct xfs_perag *pag, 2294 xfs_extlen_t need, 2295 xfs_extlen_t reserved) 2296 { 2297 xfs_extlen_t delta = 0; 2298 2299 /* 2300 * If the AGFL needs a recharge, we'll have to subtract that from the 2301 * longest extent. 2302 */ 2303 if (need > pag->pagf_flcount) 2304 delta = need - pag->pagf_flcount; 2305 2306 /* 2307 * If we cannot maintain others' reservations with space from the 2308 * not-longest freesp extents, we'll have to subtract /that/ from 2309 * the longest extent too. 2310 */ 2311 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2312 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2313 2314 /* 2315 * If the longest extent is long enough to satisfy all the 2316 * reservations and AGFL rules in place, we can return this extent. 2317 */ 2318 if (pag->pagf_longest > delta) 2319 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, 2320 pag->pagf_longest - delta); 2321 2322 /* Otherwise, let the caller try for 1 block if there's space. */ 2323 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2324 } 2325 2326 /* 2327 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2328 * return the largest possible minimum length. 2329 */ 2330 unsigned int 2331 xfs_alloc_min_freelist( 2332 struct xfs_mount *mp, 2333 struct xfs_perag *pag) 2334 { 2335 /* AG btrees have at least 1 level. */ 2336 const unsigned int bno_level = pag ? pag->pagf_bno_level : 1; 2337 const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1; 2338 const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1; 2339 unsigned int min_free; 2340 2341 ASSERT(mp->m_alloc_maxlevels > 0); 2342 2343 /* 2344 * For a btree shorter than the maximum height, the worst case is that 2345 * every level gets split and a new level is added, then while inserting 2346 * another entry to refill the AGFL, every level under the old root gets 2347 * split again. This is: 2348 * 2349 * (full height split reservation) + (AGFL refill split height) 2350 * = (current height + 1) + (current height - 1) 2351 * = (new height) + (new height - 2) 2352 * = 2 * new height - 2 2353 * 2354 * For a btree of maximum height, the worst case is that every level 2355 * under the root gets split, then while inserting another entry to 2356 * refill the AGFL, every level under the root gets split again. This is 2357 * also: 2358 * 2359 * 2 * (current height - 1) 2360 * = 2 * (new height - 1) 2361 * = 2 * new height - 2 2362 */ 2363 2364 /* space needed by-bno freespace btree */ 2365 min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2; 2366 /* space needed by-size freespace btree */ 2367 min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2; 2368 /* space needed reverse mapping used space btree */ 2369 if (xfs_has_rmapbt(mp)) 2370 min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2; 2371 return min_free; 2372 } 2373 2374 /* 2375 * Check if the operation we are fixing up the freelist for should go ahead or 2376 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2377 * is dependent on whether the size and shape of free space available will 2378 * permit the requested allocation to take place. 2379 */ 2380 static bool 2381 xfs_alloc_space_available( 2382 struct xfs_alloc_arg *args, 2383 xfs_extlen_t min_free, 2384 int flags) 2385 { 2386 struct xfs_perag *pag = args->pag; 2387 xfs_extlen_t alloc_len, longest; 2388 xfs_extlen_t reservation; /* blocks that are still reserved */ 2389 int available; 2390 xfs_extlen_t agflcount; 2391 2392 if (flags & XFS_ALLOC_FLAG_FREEING) 2393 return true; 2394 2395 reservation = xfs_ag_resv_needed(pag, args->resv); 2396 2397 /* do we have enough contiguous free space for the allocation? */ 2398 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2399 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2400 if (longest < alloc_len) 2401 return false; 2402 2403 /* 2404 * Do we have enough free space remaining for the allocation? Don't 2405 * account extra agfl blocks because we are about to defer free them, 2406 * making them unavailable until the current transaction commits. 2407 */ 2408 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2409 available = (int)(pag->pagf_freeblks + agflcount - 2410 reservation - min_free - args->minleft); 2411 if (available < (int)max(args->total, alloc_len)) 2412 return false; 2413 2414 /* 2415 * Clamp maxlen to the amount of free space available for the actual 2416 * extent allocation. 2417 */ 2418 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2419 args->maxlen = available; 2420 ASSERT(args->maxlen > 0); 2421 ASSERT(args->maxlen >= args->minlen); 2422 } 2423 2424 return true; 2425 } 2426 2427 int 2428 xfs_free_agfl_block( 2429 struct xfs_trans *tp, 2430 xfs_agnumber_t agno, 2431 xfs_agblock_t agbno, 2432 struct xfs_buf *agbp, 2433 struct xfs_owner_info *oinfo) 2434 { 2435 int error; 2436 struct xfs_buf *bp; 2437 2438 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, 2439 XFS_AG_RESV_AGFL); 2440 if (error) 2441 return error; 2442 2443 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, 2444 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), 2445 tp->t_mountp->m_bsize, 0, &bp); 2446 if (error) 2447 return error; 2448 xfs_trans_binval(tp, bp); 2449 2450 return 0; 2451 } 2452 2453 /* 2454 * Check the agfl fields of the agf for inconsistency or corruption. 2455 * 2456 * The original purpose was to detect an agfl header padding mismatch between 2457 * current and early v5 kernels. This problem manifests as a 1-slot size 2458 * difference between the on-disk flcount and the active [first, last] range of 2459 * a wrapped agfl. 2460 * 2461 * However, we need to use these same checks to catch agfl count corruptions 2462 * unrelated to padding. This could occur on any v4 or v5 filesystem, so either 2463 * way, we need to reset the agfl and warn the user. 2464 * 2465 * Return true if a reset is required before the agfl can be used, false 2466 * otherwise. 2467 */ 2468 static bool 2469 xfs_agfl_needs_reset( 2470 struct xfs_mount *mp, 2471 struct xfs_agf *agf) 2472 { 2473 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2474 uint32_t l = be32_to_cpu(agf->agf_fllast); 2475 uint32_t c = be32_to_cpu(agf->agf_flcount); 2476 int agfl_size = xfs_agfl_size(mp); 2477 int active; 2478 2479 /* 2480 * The agf read verifier catches severe corruption of these fields. 2481 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2482 * the verifier allows it. 2483 */ 2484 if (f >= agfl_size || l >= agfl_size) 2485 return true; 2486 if (c > agfl_size) 2487 return true; 2488 2489 /* 2490 * Check consistency between the on-disk count and the active range. An 2491 * agfl padding mismatch manifests as an inconsistent flcount. 2492 */ 2493 if (c && l >= f) 2494 active = l - f + 1; 2495 else if (c) 2496 active = agfl_size - f + l + 1; 2497 else 2498 active = 0; 2499 2500 return active != c; 2501 } 2502 2503 /* 2504 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2505 * agfl content cannot be trusted. Warn the user that a repair is required to 2506 * recover leaked blocks. 2507 * 2508 * The purpose of this mechanism is to handle filesystems affected by the agfl 2509 * header padding mismatch problem. A reset keeps the filesystem online with a 2510 * relatively minor free space accounting inconsistency rather than suffer the 2511 * inevitable crash from use of an invalid agfl block. 2512 */ 2513 static void 2514 xfs_agfl_reset( 2515 struct xfs_trans *tp, 2516 struct xfs_buf *agbp, 2517 struct xfs_perag *pag) 2518 { 2519 struct xfs_mount *mp = tp->t_mountp; 2520 struct xfs_agf *agf = agbp->b_addr; 2521 2522 ASSERT(xfs_perag_agfl_needs_reset(pag)); 2523 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2524 2525 xfs_warn(mp, 2526 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " 2527 "Please unmount and run xfs_repair.", 2528 pag->pag_agno, pag->pagf_flcount); 2529 2530 agf->agf_flfirst = 0; 2531 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2532 agf->agf_flcount = 0; 2533 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2534 XFS_AGF_FLCOUNT); 2535 2536 pag->pagf_flcount = 0; 2537 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 2538 } 2539 2540 /* 2541 * Defer an AGFL block free. This is effectively equivalent to 2542 * xfs_free_extent_later() with some special handling particular to AGFL blocks. 2543 * 2544 * Deferring AGFL frees helps prevent log reservation overruns due to too many 2545 * allocation operations in a transaction. AGFL frees are prone to this problem 2546 * because for one they are always freed one at a time. Further, an immediate 2547 * AGFL block free can cause a btree join and require another block free before 2548 * the real allocation can proceed. Deferring the free disconnects freeing up 2549 * the AGFL slot from freeing the block. 2550 */ 2551 static int 2552 xfs_defer_agfl_block( 2553 struct xfs_trans *tp, 2554 xfs_agnumber_t agno, 2555 xfs_agblock_t agbno, 2556 struct xfs_owner_info *oinfo) 2557 { 2558 struct xfs_mount *mp = tp->t_mountp; 2559 struct xfs_extent_free_item *xefi; 2560 xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); 2561 2562 ASSERT(xfs_extfree_item_cache != NULL); 2563 ASSERT(oinfo != NULL); 2564 2565 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) 2566 return -EFSCORRUPTED; 2567 2568 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2569 GFP_KERNEL | __GFP_NOFAIL); 2570 xefi->xefi_startblock = fsbno; 2571 xefi->xefi_blockcount = 1; 2572 xefi->xefi_owner = oinfo->oi_owner; 2573 xefi->xefi_agresv = XFS_AG_RESV_AGFL; 2574 2575 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); 2576 2577 xfs_extent_free_get_group(mp, xefi); 2578 xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); 2579 return 0; 2580 } 2581 2582 /* 2583 * Add the extent to the list of extents to be free at transaction end. 2584 * The list is maintained sorted (by block number). 2585 */ 2586 static int 2587 xfs_defer_extent_free( 2588 struct xfs_trans *tp, 2589 xfs_fsblock_t bno, 2590 xfs_filblks_t len, 2591 const struct xfs_owner_info *oinfo, 2592 enum xfs_ag_resv_type type, 2593 bool skip_discard, 2594 struct xfs_defer_pending **dfpp) 2595 { 2596 struct xfs_extent_free_item *xefi; 2597 struct xfs_mount *mp = tp->t_mountp; 2598 #ifdef DEBUG 2599 xfs_agnumber_t agno; 2600 xfs_agblock_t agbno; 2601 2602 ASSERT(bno != NULLFSBLOCK); 2603 ASSERT(len > 0); 2604 ASSERT(len <= XFS_MAX_BMBT_EXTLEN); 2605 ASSERT(!isnullstartblock(bno)); 2606 agno = XFS_FSB_TO_AGNO(mp, bno); 2607 agbno = XFS_FSB_TO_AGBNO(mp, bno); 2608 ASSERT(agno < mp->m_sb.sb_agcount); 2609 ASSERT(agbno < mp->m_sb.sb_agblocks); 2610 ASSERT(len < mp->m_sb.sb_agblocks); 2611 ASSERT(agbno + len <= mp->m_sb.sb_agblocks); 2612 #endif 2613 ASSERT(xfs_extfree_item_cache != NULL); 2614 ASSERT(type != XFS_AG_RESV_AGFL); 2615 2616 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) 2617 return -EFSCORRUPTED; 2618 2619 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2620 GFP_KERNEL | __GFP_NOFAIL); 2621 xefi->xefi_startblock = bno; 2622 xefi->xefi_blockcount = (xfs_extlen_t)len; 2623 xefi->xefi_agresv = type; 2624 if (skip_discard) 2625 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; 2626 if (oinfo) { 2627 ASSERT(oinfo->oi_offset == 0); 2628 2629 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 2630 xefi->xefi_flags |= XFS_EFI_ATTR_FORK; 2631 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 2632 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; 2633 xefi->xefi_owner = oinfo->oi_owner; 2634 } else { 2635 xefi->xefi_owner = XFS_RMAP_OWN_NULL; 2636 } 2637 trace_xfs_bmap_free_defer(mp, 2638 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, 2639 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); 2640 2641 xfs_extent_free_get_group(mp, xefi); 2642 *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); 2643 return 0; 2644 } 2645 2646 int 2647 xfs_free_extent_later( 2648 struct xfs_trans *tp, 2649 xfs_fsblock_t bno, 2650 xfs_filblks_t len, 2651 const struct xfs_owner_info *oinfo, 2652 enum xfs_ag_resv_type type, 2653 bool skip_discard) 2654 { 2655 struct xfs_defer_pending *dontcare = NULL; 2656 2657 return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, 2658 &dontcare); 2659 } 2660 2661 /* 2662 * Set up automatic freeing of unwritten space in the filesystem. 2663 * 2664 * This function attached a paused deferred extent free item to the 2665 * transaction. Pausing means that the EFI will be logged in the next 2666 * transaction commit, but the pending EFI will not be finished until the 2667 * pending item is unpaused. 2668 * 2669 * If the system goes down after the EFI has been persisted to the log but 2670 * before the pending item is unpaused, log recovery will find the EFI, fail to 2671 * find the EFD, and free the space. 2672 * 2673 * If the pending item is unpaused, the next transaction commit will log an EFD 2674 * without freeing the space. 2675 * 2676 * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the 2677 * @args structure are set to the relevant values. 2678 */ 2679 int 2680 xfs_alloc_schedule_autoreap( 2681 const struct xfs_alloc_arg *args, 2682 bool skip_discard, 2683 struct xfs_alloc_autoreap *aarp) 2684 { 2685 int error; 2686 2687 error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, 2688 &args->oinfo, args->resv, skip_discard, &aarp->dfp); 2689 if (error) 2690 return error; 2691 2692 xfs_defer_item_pause(args->tp, aarp->dfp); 2693 return 0; 2694 } 2695 2696 /* 2697 * Cancel automatic freeing of unwritten space in the filesystem. 2698 * 2699 * Earlier, we created a paused deferred extent free item and attached it to 2700 * this transaction so that we could automatically roll back a new space 2701 * allocation if the system went down. Now we want to cancel the paused work 2702 * item by marking the EFI stale so we don't actually free the space, unpausing 2703 * the pending item and logging an EFD. 2704 * 2705 * The caller generally should have already mapped the space into the ondisk 2706 * filesystem. If the reserved space was partially used, the caller must call 2707 * xfs_free_extent_later to create a new EFI to free the unused space. 2708 */ 2709 void 2710 xfs_alloc_cancel_autoreap( 2711 struct xfs_trans *tp, 2712 struct xfs_alloc_autoreap *aarp) 2713 { 2714 struct xfs_defer_pending *dfp = aarp->dfp; 2715 struct xfs_extent_free_item *xefi; 2716 2717 if (!dfp) 2718 return; 2719 2720 list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) 2721 xefi->xefi_flags |= XFS_EFI_CANCELLED; 2722 2723 xfs_defer_item_unpause(tp, dfp); 2724 } 2725 2726 /* 2727 * Commit automatic freeing of unwritten space in the filesystem. 2728 * 2729 * This unpauses an earlier _schedule_autoreap and commits to freeing the 2730 * allocated space. Call this if none of the reserved space was used. 2731 */ 2732 void 2733 xfs_alloc_commit_autoreap( 2734 struct xfs_trans *tp, 2735 struct xfs_alloc_autoreap *aarp) 2736 { 2737 if (aarp->dfp) 2738 xfs_defer_item_unpause(tp, aarp->dfp); 2739 } 2740 2741 #ifdef DEBUG 2742 /* 2743 * Check if an AGF has a free extent record whose length is equal to 2744 * args->minlen. 2745 */ 2746 STATIC int 2747 xfs_exact_minlen_extent_available( 2748 struct xfs_alloc_arg *args, 2749 struct xfs_buf *agbp, 2750 int *stat) 2751 { 2752 struct xfs_btree_cur *cnt_cur; 2753 xfs_agblock_t fbno; 2754 xfs_extlen_t flen; 2755 int error = 0; 2756 2757 cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp, 2758 args->pag); 2759 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); 2760 if (error) 2761 goto out; 2762 2763 if (*stat == 0) { 2764 xfs_btree_mark_sick(cnt_cur); 2765 error = -EFSCORRUPTED; 2766 goto out; 2767 } 2768 2769 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); 2770 if (error) 2771 goto out; 2772 2773 if (*stat == 1 && flen != args->minlen) 2774 *stat = 0; 2775 2776 out: 2777 xfs_btree_del_cursor(cnt_cur, error); 2778 2779 return error; 2780 } 2781 #endif 2782 2783 /* 2784 * Decide whether to use this allocation group for this allocation. 2785 * If so, fix up the btree freelist's size. 2786 */ 2787 int /* error */ 2788 xfs_alloc_fix_freelist( 2789 struct xfs_alloc_arg *args, /* allocation argument structure */ 2790 uint32_t alloc_flags) 2791 { 2792 struct xfs_mount *mp = args->mp; 2793 struct xfs_perag *pag = args->pag; 2794 struct xfs_trans *tp = args->tp; 2795 struct xfs_buf *agbp = NULL; 2796 struct xfs_buf *agflbp = NULL; 2797 struct xfs_alloc_arg targs; /* local allocation arguments */ 2798 xfs_agblock_t bno; /* freelist block */ 2799 xfs_extlen_t need; /* total blocks needed in freelist */ 2800 int error = 0; 2801 2802 /* deferred ops (AGFL block frees) require permanent transactions */ 2803 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2804 2805 if (!xfs_perag_initialised_agf(pag)) { 2806 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2807 if (error) { 2808 /* Couldn't lock the AGF so skip this AG. */ 2809 if (error == -EAGAIN) 2810 error = 0; 2811 goto out_no_agbp; 2812 } 2813 } 2814 2815 /* 2816 * If this is a metadata preferred pag and we are user data then try 2817 * somewhere else if we are not being asked to try harder at this 2818 * point 2819 */ 2820 if (xfs_perag_prefers_metadata(pag) && 2821 (args->datatype & XFS_ALLOC_USERDATA) && 2822 (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2823 ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); 2824 goto out_agbp_relse; 2825 } 2826 2827 need = xfs_alloc_min_freelist(mp, pag); 2828 if (!xfs_alloc_space_available(args, need, alloc_flags | 2829 XFS_ALLOC_FLAG_CHECK)) 2830 goto out_agbp_relse; 2831 2832 /* 2833 * Get the a.g. freespace buffer. 2834 * Can fail if we're not blocking on locks, and it's held. 2835 */ 2836 if (!agbp) { 2837 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2838 if (error) { 2839 /* Couldn't lock the AGF so skip this AG. */ 2840 if (error == -EAGAIN) 2841 error = 0; 2842 goto out_no_agbp; 2843 } 2844 } 2845 2846 /* reset a padding mismatched agfl before final free space check */ 2847 if (xfs_perag_agfl_needs_reset(pag)) 2848 xfs_agfl_reset(tp, agbp, pag); 2849 2850 /* If there isn't enough total space or single-extent, reject it. */ 2851 need = xfs_alloc_min_freelist(mp, pag); 2852 if (!xfs_alloc_space_available(args, need, alloc_flags)) 2853 goto out_agbp_relse; 2854 2855 #ifdef DEBUG 2856 if (args->alloc_minlen_only) { 2857 int stat; 2858 2859 error = xfs_exact_minlen_extent_available(args, agbp, &stat); 2860 if (error || !stat) 2861 goto out_agbp_relse; 2862 } 2863 #endif 2864 /* 2865 * Make the freelist shorter if it's too long. 2866 * 2867 * Note that from this point onwards, we will always release the agf and 2868 * agfl buffers on error. This handles the case where we error out and 2869 * the buffers are clean or may not have been joined to the transaction 2870 * and hence need to be released manually. If they have been joined to 2871 * the transaction, then xfs_trans_brelse() will handle them 2872 * appropriately based on the recursion count and dirty state of the 2873 * buffer. 2874 * 2875 * XXX (dgc): When we have lots of free space, does this buy us 2876 * anything other than extra overhead when we need to put more blocks 2877 * back on the free list? Maybe we should only do this when space is 2878 * getting low or the AGFL is more than half full? 2879 * 2880 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2881 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2882 * updating the rmapbt. Both flags are used in xfs_repair while we're 2883 * rebuilding the rmapbt, and neither are used by the kernel. They're 2884 * both required to ensure that rmaps are correctly recorded for the 2885 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2886 * repair/rmap.c in xfsprogs for details. 2887 */ 2888 memset(&targs, 0, sizeof(targs)); 2889 /* struct copy below */ 2890 if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) 2891 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2892 else 2893 targs.oinfo = XFS_RMAP_OINFO_AG; 2894 while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && 2895 pag->pagf_flcount > need) { 2896 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); 2897 if (error) 2898 goto out_agbp_relse; 2899 2900 /* defer agfl frees */ 2901 error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); 2902 if (error) 2903 goto out_agbp_relse; 2904 } 2905 2906 targs.tp = tp; 2907 targs.mp = mp; 2908 targs.agbp = agbp; 2909 targs.agno = args->agno; 2910 targs.alignment = targs.minlen = targs.prod = 1; 2911 targs.pag = pag; 2912 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2913 if (error) 2914 goto out_agbp_relse; 2915 2916 /* Make the freelist longer if it's too short. */ 2917 while (pag->pagf_flcount < need) { 2918 targs.agbno = 0; 2919 targs.maxlen = need - pag->pagf_flcount; 2920 targs.resv = XFS_AG_RESV_AGFL; 2921 2922 /* Allocate as many blocks as possible at once. */ 2923 error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); 2924 if (error) 2925 goto out_agflbp_relse; 2926 2927 /* 2928 * Stop if we run out. Won't happen if callers are obeying 2929 * the restrictions correctly. Can happen for free calls 2930 * on a completely full ag. 2931 */ 2932 if (targs.agbno == NULLAGBLOCK) { 2933 if (alloc_flags & XFS_ALLOC_FLAG_FREEING) 2934 break; 2935 goto out_agflbp_relse; 2936 } 2937 2938 if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { 2939 error = xfs_rmap_alloc(tp, agbp, pag, 2940 targs.agbno, targs.len, &targs.oinfo); 2941 if (error) 2942 goto out_agflbp_relse; 2943 } 2944 error = xfs_alloc_update_counters(tp, agbp, 2945 -((long)(targs.len))); 2946 if (error) 2947 goto out_agflbp_relse; 2948 2949 /* 2950 * Put each allocated block on the list. 2951 */ 2952 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 2953 error = xfs_alloc_put_freelist(pag, tp, agbp, 2954 agflbp, bno, 0); 2955 if (error) 2956 goto out_agflbp_relse; 2957 } 2958 } 2959 xfs_trans_brelse(tp, agflbp); 2960 args->agbp = agbp; 2961 return 0; 2962 2963 out_agflbp_relse: 2964 xfs_trans_brelse(tp, agflbp); 2965 out_agbp_relse: 2966 if (agbp) 2967 xfs_trans_brelse(tp, agbp); 2968 out_no_agbp: 2969 args->agbp = NULL; 2970 return error; 2971 } 2972 2973 /* 2974 * Get a block from the freelist. 2975 * Returns with the buffer for the block gotten. 2976 */ 2977 int 2978 xfs_alloc_get_freelist( 2979 struct xfs_perag *pag, 2980 struct xfs_trans *tp, 2981 struct xfs_buf *agbp, 2982 xfs_agblock_t *bnop, 2983 int btreeblk) 2984 { 2985 struct xfs_agf *agf = agbp->b_addr; 2986 struct xfs_buf *agflbp; 2987 xfs_agblock_t bno; 2988 __be32 *agfl_bno; 2989 int error; 2990 uint32_t logflags; 2991 struct xfs_mount *mp = tp->t_mountp; 2992 2993 /* 2994 * Freelist is empty, give up. 2995 */ 2996 if (!agf->agf_flcount) { 2997 *bnop = NULLAGBLOCK; 2998 return 0; 2999 } 3000 /* 3001 * Read the array of free blocks. 3002 */ 3003 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 3004 if (error) 3005 return error; 3006 3007 3008 /* 3009 * Get the block number and update the data structures. 3010 */ 3011 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3012 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 3013 if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) 3014 return -EFSCORRUPTED; 3015 3016 be32_add_cpu(&agf->agf_flfirst, 1); 3017 xfs_trans_brelse(tp, agflbp); 3018 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 3019 agf->agf_flfirst = 0; 3020 3021 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 3022 be32_add_cpu(&agf->agf_flcount, -1); 3023 pag->pagf_flcount--; 3024 3025 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 3026 if (btreeblk) { 3027 be32_add_cpu(&agf->agf_btreeblks, 1); 3028 pag->pagf_btreeblks++; 3029 logflags |= XFS_AGF_BTREEBLKS; 3030 } 3031 3032 xfs_alloc_log_agf(tp, agbp, logflags); 3033 *bnop = bno; 3034 3035 return 0; 3036 } 3037 3038 /* 3039 * Log the given fields from the agf structure. 3040 */ 3041 void 3042 xfs_alloc_log_agf( 3043 struct xfs_trans *tp, 3044 struct xfs_buf *bp, 3045 uint32_t fields) 3046 { 3047 int first; /* first byte offset */ 3048 int last; /* last byte offset */ 3049 static const short offsets[] = { 3050 offsetof(xfs_agf_t, agf_magicnum), 3051 offsetof(xfs_agf_t, agf_versionnum), 3052 offsetof(xfs_agf_t, agf_seqno), 3053 offsetof(xfs_agf_t, agf_length), 3054 offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */ 3055 offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */ 3056 offsetof(xfs_agf_t, agf_flfirst), 3057 offsetof(xfs_agf_t, agf_fllast), 3058 offsetof(xfs_agf_t, agf_flcount), 3059 offsetof(xfs_agf_t, agf_freeblks), 3060 offsetof(xfs_agf_t, agf_longest), 3061 offsetof(xfs_agf_t, agf_btreeblks), 3062 offsetof(xfs_agf_t, agf_uuid), 3063 offsetof(xfs_agf_t, agf_rmap_blocks), 3064 offsetof(xfs_agf_t, agf_refcount_blocks), 3065 offsetof(xfs_agf_t, agf_refcount_root), 3066 offsetof(xfs_agf_t, agf_refcount_level), 3067 /* needed so that we don't log the whole rest of the structure: */ 3068 offsetof(xfs_agf_t, agf_spare64), 3069 sizeof(xfs_agf_t) 3070 }; 3071 3072 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); 3073 3074 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 3075 3076 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 3077 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 3078 } 3079 3080 /* 3081 * Put the block on the freelist for the allocation group. 3082 */ 3083 int 3084 xfs_alloc_put_freelist( 3085 struct xfs_perag *pag, 3086 struct xfs_trans *tp, 3087 struct xfs_buf *agbp, 3088 struct xfs_buf *agflbp, 3089 xfs_agblock_t bno, 3090 int btreeblk) 3091 { 3092 struct xfs_mount *mp = tp->t_mountp; 3093 struct xfs_agf *agf = agbp->b_addr; 3094 __be32 *blockp; 3095 int error; 3096 uint32_t logflags; 3097 __be32 *agfl_bno; 3098 int startoff; 3099 3100 if (!agflbp) { 3101 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 3102 if (error) 3103 return error; 3104 } 3105 3106 be32_add_cpu(&agf->agf_fllast, 1); 3107 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 3108 agf->agf_fllast = 0; 3109 3110 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 3111 be32_add_cpu(&agf->agf_flcount, 1); 3112 pag->pagf_flcount++; 3113 3114 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 3115 if (btreeblk) { 3116 be32_add_cpu(&agf->agf_btreeblks, -1); 3117 pag->pagf_btreeblks--; 3118 logflags |= XFS_AGF_BTREEBLKS; 3119 } 3120 3121 xfs_alloc_log_agf(tp, agbp, logflags); 3122 3123 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 3124 3125 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3126 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 3127 *blockp = cpu_to_be32(bno); 3128 startoff = (char *)blockp - (char *)agflbp->b_addr; 3129 3130 xfs_alloc_log_agf(tp, agbp, logflags); 3131 3132 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 3133 xfs_trans_log_buf(tp, agflbp, startoff, 3134 startoff + sizeof(xfs_agblock_t) - 1); 3135 return 0; 3136 } 3137 3138 /* 3139 * Check that this AGF/AGI header's sequence number and length matches the AG 3140 * number and size in fsblocks. 3141 */ 3142 xfs_failaddr_t 3143 xfs_validate_ag_length( 3144 struct xfs_buf *bp, 3145 uint32_t seqno, 3146 uint32_t length) 3147 { 3148 struct xfs_mount *mp = bp->b_mount; 3149 /* 3150 * During growfs operations, the perag is not fully initialised, 3151 * so we can't use it for any useful checking. growfs ensures we can't 3152 * use it by using uncached buffers that don't have the perag attached 3153 * so we can detect and avoid this problem. 3154 */ 3155 if (bp->b_pag && seqno != bp->b_pag->pag_agno) 3156 return __this_address; 3157 3158 /* 3159 * Only the last AG in the filesystem is allowed to be shorter 3160 * than the AG size recorded in the superblock. 3161 */ 3162 if (length != mp->m_sb.sb_agblocks) { 3163 /* 3164 * During growfs, the new last AG can get here before we 3165 * have updated the superblock. Give it a pass on the seqno 3166 * check. 3167 */ 3168 if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) 3169 return __this_address; 3170 if (length < XFS_MIN_AG_BLOCKS) 3171 return __this_address; 3172 if (length > mp->m_sb.sb_agblocks) 3173 return __this_address; 3174 } 3175 3176 return NULL; 3177 } 3178 3179 /* 3180 * Verify the AGF is consistent. 3181 * 3182 * We do not verify the AGFL indexes in the AGF are fully consistent here 3183 * because of issues with variable on-disk structure sizes. Instead, we check 3184 * the agfl indexes for consistency when we initialise the perag from the AGF 3185 * information after a read completes. 3186 * 3187 * If the index is inconsistent, then we mark the perag as needing an AGFL 3188 * reset. The first AGFL update performed then resets the AGFL indexes and 3189 * refills the AGFL with known good free blocks, allowing the filesystem to 3190 * continue operating normally at the cost of a few leaked free space blocks. 3191 */ 3192 static xfs_failaddr_t 3193 xfs_agf_verify( 3194 struct xfs_buf *bp) 3195 { 3196 struct xfs_mount *mp = bp->b_mount; 3197 struct xfs_agf *agf = bp->b_addr; 3198 xfs_failaddr_t fa; 3199 uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno); 3200 uint32_t agf_length = be32_to_cpu(agf->agf_length); 3201 3202 if (xfs_has_crc(mp)) { 3203 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 3204 return __this_address; 3205 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) 3206 return __this_address; 3207 } 3208 3209 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 3210 return __this_address; 3211 3212 if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum))) 3213 return __this_address; 3214 3215 /* 3216 * Both agf_seqno and agf_length need to validated before anything else 3217 * block number related in the AGF or AGFL can be checked. 3218 */ 3219 fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); 3220 if (fa) 3221 return fa; 3222 3223 if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) 3224 return __this_address; 3225 if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) 3226 return __this_address; 3227 if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) 3228 return __this_address; 3229 3230 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || 3231 be32_to_cpu(agf->agf_freeblks) > agf_length) 3232 return __this_address; 3233 3234 if (be32_to_cpu(agf->agf_bno_level) < 1 || 3235 be32_to_cpu(agf->agf_cnt_level) < 1 || 3236 be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels || 3237 be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels) 3238 return __this_address; 3239 3240 if (xfs_has_lazysbcount(mp) && 3241 be32_to_cpu(agf->agf_btreeblks) > agf_length) 3242 return __this_address; 3243 3244 if (xfs_has_rmapbt(mp)) { 3245 if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) 3246 return __this_address; 3247 3248 if (be32_to_cpu(agf->agf_rmap_level) < 1 || 3249 be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels) 3250 return __this_address; 3251 } 3252 3253 if (xfs_has_reflink(mp)) { 3254 if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) 3255 return __this_address; 3256 3257 if (be32_to_cpu(agf->agf_refcount_level) < 1 || 3258 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) 3259 return __this_address; 3260 } 3261 3262 return NULL; 3263 } 3264 3265 static void 3266 xfs_agf_read_verify( 3267 struct xfs_buf *bp) 3268 { 3269 struct xfs_mount *mp = bp->b_mount; 3270 xfs_failaddr_t fa; 3271 3272 if (xfs_has_crc(mp) && 3273 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 3274 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 3275 else { 3276 fa = xfs_agf_verify(bp); 3277 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 3278 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3279 } 3280 } 3281 3282 static void 3283 xfs_agf_write_verify( 3284 struct xfs_buf *bp) 3285 { 3286 struct xfs_mount *mp = bp->b_mount; 3287 struct xfs_buf_log_item *bip = bp->b_log_item; 3288 struct xfs_agf *agf = bp->b_addr; 3289 xfs_failaddr_t fa; 3290 3291 fa = xfs_agf_verify(bp); 3292 if (fa) { 3293 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3294 return; 3295 } 3296 3297 if (!xfs_has_crc(mp)) 3298 return; 3299 3300 if (bip) 3301 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 3302 3303 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 3304 } 3305 3306 const struct xfs_buf_ops xfs_agf_buf_ops = { 3307 .name = "xfs_agf", 3308 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 3309 .verify_read = xfs_agf_read_verify, 3310 .verify_write = xfs_agf_write_verify, 3311 .verify_struct = xfs_agf_verify, 3312 }; 3313 3314 /* 3315 * Read in the allocation group header (free/alloc section). 3316 */ 3317 int 3318 xfs_read_agf( 3319 struct xfs_perag *pag, 3320 struct xfs_trans *tp, 3321 int flags, 3322 struct xfs_buf **agfbpp) 3323 { 3324 struct xfs_mount *mp = pag->pag_mount; 3325 int error; 3326 3327 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); 3328 3329 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 3330 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), 3331 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); 3332 if (xfs_metadata_is_sick(error)) 3333 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); 3334 if (error) 3335 return error; 3336 3337 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); 3338 return 0; 3339 } 3340 3341 /* 3342 * Read in the allocation group header (free/alloc section) and initialise the 3343 * perag structure if necessary. If the caller provides @agfbpp, then return the 3344 * locked buffer to the caller, otherwise free it. 3345 */ 3346 int 3347 xfs_alloc_read_agf( 3348 struct xfs_perag *pag, 3349 struct xfs_trans *tp, 3350 int flags, 3351 struct xfs_buf **agfbpp) 3352 { 3353 struct xfs_buf *agfbp; 3354 struct xfs_agf *agf; 3355 int error; 3356 int allocbt_blks; 3357 3358 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); 3359 3360 /* We don't support trylock when freeing. */ 3361 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 3362 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 3363 error = xfs_read_agf(pag, tp, 3364 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, 3365 &agfbp); 3366 if (error) 3367 return error; 3368 3369 agf = agfbp->b_addr; 3370 if (!xfs_perag_initialised_agf(pag)) { 3371 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3372 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3373 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3374 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3375 pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); 3376 pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); 3377 pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); 3378 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3379 if (xfs_agfl_needs_reset(pag->pag_mount, agf)) 3380 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3381 else 3382 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3383 3384 /* 3385 * Update the in-core allocbt counter. Filter out the rmapbt 3386 * subset of the btreeblks counter because the rmapbt is managed 3387 * by perag reservation. Subtract one for the rmapbt root block 3388 * because the rmap counter includes it while the btreeblks 3389 * counter only tracks non-root blocks. 3390 */ 3391 allocbt_blks = pag->pagf_btreeblks; 3392 if (xfs_has_rmapbt(pag->pag_mount)) 3393 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; 3394 if (allocbt_blks > 0) 3395 atomic64_add(allocbt_blks, 3396 &pag->pag_mount->m_allocbt_blks); 3397 3398 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); 3399 } 3400 #ifdef DEBUG 3401 else if (!xfs_is_shutdown(pag->pag_mount)) { 3402 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3403 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3404 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3405 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3406 ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); 3407 ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); 3408 } 3409 #endif 3410 if (agfbpp) 3411 *agfbpp = agfbp; 3412 else 3413 xfs_trans_brelse(tp, agfbp); 3414 return 0; 3415 } 3416 3417 /* 3418 * Pre-proces allocation arguments to set initial state that we don't require 3419 * callers to set up correctly, as well as bounds check the allocation args 3420 * that are set up. 3421 */ 3422 static int 3423 xfs_alloc_vextent_check_args( 3424 struct xfs_alloc_arg *args, 3425 xfs_fsblock_t target, 3426 xfs_agnumber_t *minimum_agno) 3427 { 3428 struct xfs_mount *mp = args->mp; 3429 xfs_agblock_t agsize; 3430 3431 args->fsbno = NULLFSBLOCK; 3432 3433 *minimum_agno = 0; 3434 if (args->tp->t_highest_agno != NULLAGNUMBER) 3435 *minimum_agno = args->tp->t_highest_agno; 3436 3437 /* 3438 * Just fix this up, for the case where the last a.g. is shorter 3439 * (or there's only one a.g.) and the caller couldn't easily figure 3440 * that out (xfs_bmap_alloc). 3441 */ 3442 agsize = mp->m_sb.sb_agblocks; 3443 if (args->maxlen > agsize) 3444 args->maxlen = agsize; 3445 if (args->alignment == 0) 3446 args->alignment = 1; 3447 3448 ASSERT(args->minlen > 0); 3449 ASSERT(args->maxlen > 0); 3450 ASSERT(args->alignment > 0); 3451 ASSERT(args->resv != XFS_AG_RESV_AGFL); 3452 3453 ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); 3454 ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); 3455 ASSERT(args->minlen <= args->maxlen); 3456 ASSERT(args->minlen <= agsize); 3457 ASSERT(args->mod < args->prod); 3458 3459 if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || 3460 XFS_FSB_TO_AGBNO(mp, target) >= agsize || 3461 args->minlen > args->maxlen || args->minlen > agsize || 3462 args->mod >= args->prod) { 3463 trace_xfs_alloc_vextent_badargs(args); 3464 return -ENOSPC; 3465 } 3466 3467 if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { 3468 trace_xfs_alloc_vextent_skip_deadlock(args); 3469 return -ENOSPC; 3470 } 3471 return 0; 3472 3473 } 3474 3475 /* 3476 * Prepare an AG for allocation. If the AG is not prepared to accept the 3477 * allocation, return failure. 3478 * 3479 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are 3480 * modified to hold their own perag references. 3481 */ 3482 static int 3483 xfs_alloc_vextent_prepare_ag( 3484 struct xfs_alloc_arg *args, 3485 uint32_t alloc_flags) 3486 { 3487 bool need_pag = !args->pag; 3488 int error; 3489 3490 if (need_pag) 3491 args->pag = xfs_perag_get(args->mp, args->agno); 3492 3493 args->agbp = NULL; 3494 error = xfs_alloc_fix_freelist(args, alloc_flags); 3495 if (error) { 3496 trace_xfs_alloc_vextent_nofix(args); 3497 if (need_pag) 3498 xfs_perag_put(args->pag); 3499 args->agbno = NULLAGBLOCK; 3500 return error; 3501 } 3502 if (!args->agbp) { 3503 /* cannot allocate in this AG at all */ 3504 trace_xfs_alloc_vextent_noagbp(args); 3505 args->agbno = NULLAGBLOCK; 3506 return 0; 3507 } 3508 args->wasfromfl = 0; 3509 return 0; 3510 } 3511 3512 /* 3513 * Post-process allocation results to account for the allocation if it succeed 3514 * and set the allocated block number correctly for the caller. 3515 * 3516 * XXX: we should really be returning ENOSPC for ENOSPC, not 3517 * hiding it behind a "successful" NULLFSBLOCK allocation. 3518 */ 3519 static int 3520 xfs_alloc_vextent_finish( 3521 struct xfs_alloc_arg *args, 3522 xfs_agnumber_t minimum_agno, 3523 int alloc_error, 3524 bool drop_perag) 3525 { 3526 struct xfs_mount *mp = args->mp; 3527 int error = 0; 3528 3529 /* 3530 * We can end up here with a locked AGF. If we failed, the caller is 3531 * likely going to try to allocate again with different parameters, and 3532 * that can widen the AGs that are searched for free space. If we have 3533 * to do BMBT block allocation, we have to do a new allocation. 3534 * 3535 * Hence leaving this function with the AGF locked opens up potential 3536 * ABBA AGF deadlocks because a future allocation attempt in this 3537 * transaction may attempt to lock a lower number AGF. 3538 * 3539 * We can't release the AGF until the transaction is commited, so at 3540 * this point we must update the "first allocation" tracker to point at 3541 * this AG if the tracker is empty or points to a lower AG. This allows 3542 * the next allocation attempt to be modified appropriately to avoid 3543 * deadlocks. 3544 */ 3545 if (args->agbp && 3546 (args->tp->t_highest_agno == NULLAGNUMBER || 3547 args->agno > minimum_agno)) 3548 args->tp->t_highest_agno = args->agno; 3549 3550 /* 3551 * If the allocation failed with an error or we had an ENOSPC result, 3552 * preserve the returned error whilst also marking the allocation result 3553 * as "no extent allocated". This ensures that callers that fail to 3554 * capture the error will still treat it as a failed allocation. 3555 */ 3556 if (alloc_error || args->agbno == NULLAGBLOCK) { 3557 args->fsbno = NULLFSBLOCK; 3558 error = alloc_error; 3559 goto out_drop_perag; 3560 } 3561 3562 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); 3563 3564 ASSERT(args->len >= args->minlen); 3565 ASSERT(args->len <= args->maxlen); 3566 ASSERT(args->agbno % args->alignment == 0); 3567 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); 3568 3569 /* if not file data, insert new block into the reverse map btree */ 3570 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 3571 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, 3572 args->agbno, args->len, &args->oinfo); 3573 if (error) 3574 goto out_drop_perag; 3575 } 3576 3577 if (!args->wasfromfl) { 3578 error = xfs_alloc_update_counters(args->tp, args->agbp, 3579 -((long)(args->len))); 3580 if (error) 3581 goto out_drop_perag; 3582 3583 ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, 3584 args->len)); 3585 } 3586 3587 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 3588 3589 XFS_STATS_INC(mp, xs_allocx); 3590 XFS_STATS_ADD(mp, xs_allocb, args->len); 3591 3592 trace_xfs_alloc_vextent_finish(args); 3593 3594 out_drop_perag: 3595 if (drop_perag && args->pag) { 3596 xfs_perag_rele(args->pag); 3597 args->pag = NULL; 3598 } 3599 return error; 3600 } 3601 3602 /* 3603 * Allocate within a single AG only. This uses a best-fit length algorithm so if 3604 * you need an exact sized allocation without locality constraints, this is the 3605 * fastest way to do it. 3606 * 3607 * Caller is expected to hold a perag reference in args->pag. 3608 */ 3609 int 3610 xfs_alloc_vextent_this_ag( 3611 struct xfs_alloc_arg *args, 3612 xfs_agnumber_t agno) 3613 { 3614 struct xfs_mount *mp = args->mp; 3615 xfs_agnumber_t minimum_agno; 3616 uint32_t alloc_flags = 0; 3617 int error; 3618 3619 ASSERT(args->pag != NULL); 3620 ASSERT(args->pag->pag_agno == agno); 3621 3622 args->agno = agno; 3623 args->agbno = 0; 3624 3625 trace_xfs_alloc_vextent_this_ag(args); 3626 3627 error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), 3628 &minimum_agno); 3629 if (error) { 3630 if (error == -ENOSPC) 3631 return 0; 3632 return error; 3633 } 3634 3635 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3636 if (!error && args->agbp) 3637 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3638 3639 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3640 } 3641 3642 /* 3643 * Iterate all AGs trying to allocate an extent starting from @start_ag. 3644 * 3645 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the 3646 * allocation attempts in @start_agno have locality information. If we fail to 3647 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs 3648 * we attempt to allocation in as there is no locality optimisation possible for 3649 * those allocations. 3650 * 3651 * On return, args->pag may be left referenced if we finish before the "all 3652 * failed" return point. The allocation finish still needs the perag, and 3653 * so the caller will release it once they've finished the allocation. 3654 * 3655 * When we wrap the AG iteration at the end of the filesystem, we have to be 3656 * careful not to wrap into AGs below ones we already have locked in the 3657 * transaction if we are doing a blocking iteration. This will result in an 3658 * out-of-order locking of AGFs and hence can cause deadlocks. 3659 */ 3660 static int 3661 xfs_alloc_vextent_iterate_ags( 3662 struct xfs_alloc_arg *args, 3663 xfs_agnumber_t minimum_agno, 3664 xfs_agnumber_t start_agno, 3665 xfs_agblock_t target_agbno, 3666 uint32_t alloc_flags) 3667 { 3668 struct xfs_mount *mp = args->mp; 3669 xfs_agnumber_t restart_agno = minimum_agno; 3670 xfs_agnumber_t agno; 3671 int error = 0; 3672 3673 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) 3674 restart_agno = 0; 3675 restart: 3676 for_each_perag_wrap_range(mp, start_agno, restart_agno, 3677 mp->m_sb.sb_agcount, agno, args->pag) { 3678 args->agno = agno; 3679 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3680 if (error) 3681 break; 3682 if (!args->agbp) { 3683 trace_xfs_alloc_vextent_loopfailed(args); 3684 continue; 3685 } 3686 3687 /* 3688 * Allocation is supposed to succeed now, so break out of the 3689 * loop regardless of whether we succeed or not. 3690 */ 3691 if (args->agno == start_agno && target_agbno) { 3692 args->agbno = target_agbno; 3693 error = xfs_alloc_ag_vextent_near(args, alloc_flags); 3694 } else { 3695 args->agbno = 0; 3696 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3697 } 3698 break; 3699 } 3700 if (error) { 3701 xfs_perag_rele(args->pag); 3702 args->pag = NULL; 3703 return error; 3704 } 3705 if (args->agbp) 3706 return 0; 3707 3708 /* 3709 * We didn't find an AG we can alloation from. If we were given 3710 * constraining flags by the caller, drop them and retry the allocation 3711 * without any constraints being set. 3712 */ 3713 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) { 3714 alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK; 3715 restart_agno = minimum_agno; 3716 goto restart; 3717 } 3718 3719 ASSERT(args->pag == NULL); 3720 trace_xfs_alloc_vextent_allfailed(args); 3721 return 0; 3722 } 3723 3724 /* 3725 * Iterate from the AGs from the start AG to the end of the filesystem, trying 3726 * to allocate blocks. It starts with a near allocation attempt in the initial 3727 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap 3728 * back to zero if allowed by previous allocations in this transaction, 3729 * otherwise will wrap back to the start AG and run a second blocking pass to 3730 * the end of the filesystem. 3731 */ 3732 int 3733 xfs_alloc_vextent_start_ag( 3734 struct xfs_alloc_arg *args, 3735 xfs_fsblock_t target) 3736 { 3737 struct xfs_mount *mp = args->mp; 3738 xfs_agnumber_t minimum_agno; 3739 xfs_agnumber_t start_agno; 3740 xfs_agnumber_t rotorstep = xfs_rotorstep; 3741 bool bump_rotor = false; 3742 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3743 int error; 3744 3745 ASSERT(args->pag == NULL); 3746 3747 args->agno = NULLAGNUMBER; 3748 args->agbno = NULLAGBLOCK; 3749 3750 trace_xfs_alloc_vextent_start_ag(args); 3751 3752 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3753 if (error) { 3754 if (error == -ENOSPC) 3755 return 0; 3756 return error; 3757 } 3758 3759 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3760 xfs_is_inode32(mp)) { 3761 target = XFS_AGB_TO_FSB(mp, 3762 ((mp->m_agfrotor / rotorstep) % 3763 mp->m_sb.sb_agcount), 0); 3764 bump_rotor = 1; 3765 } 3766 3767 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3768 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3769 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3770 3771 if (bump_rotor) { 3772 if (args->agno == start_agno) 3773 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3774 (mp->m_sb.sb_agcount * rotorstep); 3775 else 3776 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3777 (mp->m_sb.sb_agcount * rotorstep); 3778 } 3779 3780 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3781 } 3782 3783 /* 3784 * Iterate from the agno indicated via @target through to the end of the 3785 * filesystem attempting blocking allocation. This does not wrap or try a second 3786 * pass, so will not recurse into AGs lower than indicated by the target. 3787 */ 3788 int 3789 xfs_alloc_vextent_first_ag( 3790 struct xfs_alloc_arg *args, 3791 xfs_fsblock_t target) 3792 { 3793 struct xfs_mount *mp = args->mp; 3794 xfs_agnumber_t minimum_agno; 3795 xfs_agnumber_t start_agno; 3796 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3797 int error; 3798 3799 ASSERT(args->pag == NULL); 3800 3801 args->agno = NULLAGNUMBER; 3802 args->agbno = NULLAGBLOCK; 3803 3804 trace_xfs_alloc_vextent_first_ag(args); 3805 3806 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3807 if (error) { 3808 if (error == -ENOSPC) 3809 return 0; 3810 return error; 3811 } 3812 3813 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3814 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3815 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3816 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3817 } 3818 3819 /* 3820 * Allocate at the exact block target or fail. Caller is expected to hold a 3821 * perag reference in args->pag. 3822 */ 3823 int 3824 xfs_alloc_vextent_exact_bno( 3825 struct xfs_alloc_arg *args, 3826 xfs_fsblock_t target) 3827 { 3828 struct xfs_mount *mp = args->mp; 3829 xfs_agnumber_t minimum_agno; 3830 int error; 3831 3832 ASSERT(args->pag != NULL); 3833 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); 3834 3835 args->agno = XFS_FSB_TO_AGNO(mp, target); 3836 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3837 3838 trace_xfs_alloc_vextent_exact_bno(args); 3839 3840 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3841 if (error) { 3842 if (error == -ENOSPC) 3843 return 0; 3844 return error; 3845 } 3846 3847 error = xfs_alloc_vextent_prepare_ag(args, 0); 3848 if (!error && args->agbp) 3849 error = xfs_alloc_ag_vextent_exact(args); 3850 3851 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3852 } 3853 3854 /* 3855 * Allocate an extent as close to the target as possible. If there are not 3856 * viable candidates in the AG, then fail the allocation. 3857 * 3858 * Caller may or may not have a per-ag reference in args->pag. 3859 */ 3860 int 3861 xfs_alloc_vextent_near_bno( 3862 struct xfs_alloc_arg *args, 3863 xfs_fsblock_t target) 3864 { 3865 struct xfs_mount *mp = args->mp; 3866 xfs_agnumber_t minimum_agno; 3867 bool needs_perag = args->pag == NULL; 3868 uint32_t alloc_flags = 0; 3869 int error; 3870 3871 if (!needs_perag) 3872 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); 3873 3874 args->agno = XFS_FSB_TO_AGNO(mp, target); 3875 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3876 3877 trace_xfs_alloc_vextent_near_bno(args); 3878 3879 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3880 if (error) { 3881 if (error == -ENOSPC) 3882 return 0; 3883 return error; 3884 } 3885 3886 if (needs_perag) 3887 args->pag = xfs_perag_grab(mp, args->agno); 3888 3889 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3890 if (!error && args->agbp) 3891 error = xfs_alloc_ag_vextent_near(args, alloc_flags); 3892 3893 return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); 3894 } 3895 3896 /* Ensure that the freelist is at full capacity. */ 3897 int 3898 xfs_free_extent_fix_freelist( 3899 struct xfs_trans *tp, 3900 struct xfs_perag *pag, 3901 struct xfs_buf **agbp) 3902 { 3903 struct xfs_alloc_arg args; 3904 int error; 3905 3906 memset(&args, 0, sizeof(struct xfs_alloc_arg)); 3907 args.tp = tp; 3908 args.mp = tp->t_mountp; 3909 args.agno = pag->pag_agno; 3910 args.pag = pag; 3911 3912 /* 3913 * validate that the block number is legal - the enables us to detect 3914 * and handle a silent filesystem corruption rather than crashing. 3915 */ 3916 if (args.agno >= args.mp->m_sb.sb_agcount) 3917 return -EFSCORRUPTED; 3918 3919 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 3920 if (error) 3921 return error; 3922 3923 *agbp = args.agbp; 3924 return 0; 3925 } 3926 3927 /* 3928 * Free an extent. 3929 * Just break up the extent address and hand off to xfs_free_ag_extent 3930 * after fixing up the freelist. 3931 */ 3932 int 3933 __xfs_free_extent( 3934 struct xfs_trans *tp, 3935 struct xfs_perag *pag, 3936 xfs_agblock_t agbno, 3937 xfs_extlen_t len, 3938 const struct xfs_owner_info *oinfo, 3939 enum xfs_ag_resv_type type, 3940 bool skip_discard) 3941 { 3942 struct xfs_mount *mp = tp->t_mountp; 3943 struct xfs_buf *agbp; 3944 struct xfs_agf *agf; 3945 int error; 3946 unsigned int busy_flags = 0; 3947 3948 ASSERT(len != 0); 3949 ASSERT(type != XFS_AG_RESV_AGFL); 3950 3951 if (XFS_TEST_ERROR(false, mp, 3952 XFS_ERRTAG_FREE_EXTENT)) 3953 return -EIO; 3954 3955 error = xfs_free_extent_fix_freelist(tp, pag, &agbp); 3956 if (error) { 3957 if (xfs_metadata_is_sick(error)) 3958 xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); 3959 return error; 3960 } 3961 3962 agf = agbp->b_addr; 3963 3964 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) { 3965 xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); 3966 error = -EFSCORRUPTED; 3967 goto err_release; 3968 } 3969 3970 /* validate the extent size is legal now we have the agf locked */ 3971 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) { 3972 xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT); 3973 error = -EFSCORRUPTED; 3974 goto err_release; 3975 } 3976 3977 error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo, 3978 type); 3979 if (error) 3980 goto err_release; 3981 3982 if (skip_discard) 3983 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD; 3984 xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags); 3985 return 0; 3986 3987 err_release: 3988 xfs_trans_brelse(tp, agbp); 3989 return error; 3990 } 3991 3992 struct xfs_alloc_query_range_info { 3993 xfs_alloc_query_range_fn fn; 3994 void *priv; 3995 }; 3996 3997 /* Format btree record and pass to our callback. */ 3998 STATIC int 3999 xfs_alloc_query_range_helper( 4000 struct xfs_btree_cur *cur, 4001 const union xfs_btree_rec *rec, 4002 void *priv) 4003 { 4004 struct xfs_alloc_query_range_info *query = priv; 4005 struct xfs_alloc_rec_incore irec; 4006 xfs_failaddr_t fa; 4007 4008 xfs_alloc_btrec_to_irec(rec, &irec); 4009 fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec); 4010 if (fa) 4011 return xfs_alloc_complain_bad_rec(cur, fa, &irec); 4012 4013 return query->fn(cur, &irec, query->priv); 4014 } 4015 4016 /* Find all free space within a given range of blocks. */ 4017 int 4018 xfs_alloc_query_range( 4019 struct xfs_btree_cur *cur, 4020 const struct xfs_alloc_rec_incore *low_rec, 4021 const struct xfs_alloc_rec_incore *high_rec, 4022 xfs_alloc_query_range_fn fn, 4023 void *priv) 4024 { 4025 union xfs_btree_irec low_brec = { .a = *low_rec }; 4026 union xfs_btree_irec high_brec = { .a = *high_rec }; 4027 struct xfs_alloc_query_range_info query = { .priv = priv, .fn = fn }; 4028 4029 ASSERT(xfs_btree_is_bno(cur->bc_ops)); 4030 return xfs_btree_query_range(cur, &low_brec, &high_brec, 4031 xfs_alloc_query_range_helper, &query); 4032 } 4033 4034 /* Find all free space records. */ 4035 int 4036 xfs_alloc_query_all( 4037 struct xfs_btree_cur *cur, 4038 xfs_alloc_query_range_fn fn, 4039 void *priv) 4040 { 4041 struct xfs_alloc_query_range_info query; 4042 4043 ASSERT(xfs_btree_is_bno(cur->bc_ops)); 4044 query.priv = priv; 4045 query.fn = fn; 4046 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query); 4047 } 4048 4049 /* 4050 * Scan part of the keyspace of the free space and tell us if the area has no 4051 * records, is fully mapped by records, or is partially filled. 4052 */ 4053 int 4054 xfs_alloc_has_records( 4055 struct xfs_btree_cur *cur, 4056 xfs_agblock_t bno, 4057 xfs_extlen_t len, 4058 enum xbtree_recpacking *outcome) 4059 { 4060 union xfs_btree_irec low; 4061 union xfs_btree_irec high; 4062 4063 memset(&low, 0, sizeof(low)); 4064 low.a.ar_startblock = bno; 4065 memset(&high, 0xFF, sizeof(high)); 4066 high.a.ar_startblock = bno + len - 1; 4067 4068 return xfs_btree_has_records(cur, &low, &high, NULL, outcome); 4069 } 4070 4071 /* 4072 * Walk all the blocks in the AGFL. The @walk_fn can return any negative 4073 * error code or XFS_ITER_*. 4074 */ 4075 int 4076 xfs_agfl_walk( 4077 struct xfs_mount *mp, 4078 struct xfs_agf *agf, 4079 struct xfs_buf *agflbp, 4080 xfs_agfl_walk_fn walk_fn, 4081 void *priv) 4082 { 4083 __be32 *agfl_bno; 4084 unsigned int i; 4085 int error; 4086 4087 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 4088 i = be32_to_cpu(agf->agf_flfirst); 4089 4090 /* Nothing to walk in an empty AGFL. */ 4091 if (agf->agf_flcount == cpu_to_be32(0)) 4092 return 0; 4093 4094 /* Otherwise, walk from first to last, wrapping as needed. */ 4095 for (;;) { 4096 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv); 4097 if (error) 4098 return error; 4099 if (i == be32_to_cpu(agf->agf_fllast)) 4100 break; 4101 if (++i == xfs_agfl_size(mp)) 4102 i = 0; 4103 } 4104 4105 return 0; 4106 } 4107 4108 int __init 4109 xfs_extfree_intent_init_cache(void) 4110 { 4111 xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent", 4112 sizeof(struct xfs_extent_free_item), 4113 0, 0, NULL); 4114 4115 return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM; 4116 } 4117 4118 void 4119 xfs_extfree_intent_destroy_cache(void) 4120 { 4121 kmem_cache_destroy(xfs_extfree_item_cache); 4122 xfs_extfree_item_cache = NULL; 4123 } 4124