// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_bit.h"
#include "xfs_mount.h"
#include "xfs_defer.h"
#include "xfs_btree.h"
#include "xfs_rmap.h"
#include "xfs_alloc_btree.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_errortag.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_buf_item.h"
#include "xfs_log.h"
#include "xfs_ag.h"
#include "xfs_ag_resv.h"
#include "xfs_bmap.h"
#include "xfs_health.h"

struct kmem_cache	*xfs_extfree_item_cache;

struct workqueue_struct *xfs_alloc_wq;

#define XFS_ABSDIFF(a,b)	(((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))

#define	XFSA_FIXUP_BNO_OK	1
#define	XFSA_FIXUP_CNT_OK	2

/*
 * Size of the AGFL.  For CRC-enabled filesystems we steal a couple of slots in
 * the beginning of the block for a proper header with the location information
 * and CRC.
 */
unsigned int
xfs_agfl_size(
	struct xfs_mount	*mp)
{
	unsigned int		size = mp->m_sb.sb_sectsize;

	if (xfs_has_crc(mp))
		size -= sizeof(struct xfs_agfl);

	return size / sizeof(xfs_agblock_t);
}

unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

xfs_extlen_t
xfs_prealloc_blocks(
	struct xfs_mount	*mp)
{
	if (xfs_has_reflink(mp))
		return xfs_refc_block(mp) + 1;
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}

/*
 * The number of blocks per AG that we withhold from xfs_dec_fdblocks to
 * guarantee that we can refill the AGFL prior to allocating space in a nearly
 * full AG.  Although the space described by the free space btrees, the blocks
 * used by the freesp btrees themselves, and the blocks owned by the AGFL are
 * counted in the ondisk fdblocks, it's a mistake to let the ondisk free space
 * in the AG drop so low that the free space btrees cannot refill an empty
 * AGFL up to the minimum level.  Rather than grind through empty AGs until
 * the fs goes down, we subtract this many AG blocks from the incore fdblocks
 * to ensure user allocation does not overcommit the space the filesystem
 * needs for the AGFLs.  The rmap btree uses a per-AG reservation to withhold
 * space from xfs_dec_fdblocks, so we do not account for that here.
 */
#define XFS_ALLOCBT_AGFL_RESERVE	4

/*
 * Compute the number of blocks that we set aside to guarantee the ability to
 * refill the AGFL and handle a full bmap btree split.
 *
 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
 * AGF buffer (PV 947395), we place constraints on the relationship among
 * actual allocations for data blocks, freelist blocks, and potential file data
 * bmap btree blocks.  However, these restrictions may result in no actual
 * space allocated for a delayed extent; for example, a data block in a
 * certain AG is allocated but there is no additional block for the additional
 * bmap btree block due to a split of the file's bmap btree.  The result of
 * this may lead to an infinite loop when the file gets flushed to disk and
 * all delayed extents need to be actually allocated.  To get around this, we
 * explicitly set aside a few blocks which will not be reserved in delayed
 * allocation.
 *
 * For each AG, we need to reserve enough blocks to replenish a totally empty
 * AGFL and 4 more to handle a potential split of the file's bmap btree.
 */
unsigned int
xfs_alloc_set_aside(
	struct xfs_mount	*mp)
{
	return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
}
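/*
 * Illustrative example of the set-aside arithmetic (numbers are hypothetical,
 * not taken from any particular filesystem): with sb_agcount == 16 we
 * withhold 16 * (XFS_ALLOCBT_AGFL_RESERVE + 4) == 16 * 8 == 128 blocks from
 * the incore free block count.
 */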
/*
 * When deciding how much space to allocate out of an AG, we limit the
 * allocation maximum size to the size of the AG. However, we cannot use all
 * the blocks in the AG - some are permanently used by metadata. These
 * blocks are generally:
 *	- the AG superblock, AGF, AGI and AGFL
 *	- the AGF (bno and cnt) and AGI btree root blocks, and optionally
 *	  the AGI free inode and rmap btree root blocks.
 *	- blocks on the AGFL according to xfs_alloc_set_aside() limits
 *	- the rmapbt root block
 *
 * The AG headers are sector sized, so the amount of space they take up is
 * dependent on filesystem geometry. The others are all single blocks.
 */
unsigned int
xfs_alloc_ag_max_usable(
	struct xfs_mount	*mp)
{
	unsigned int		blocks;

	blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
	blocks += XFS_ALLOCBT_AGFL_RESERVE;
	blocks += 3;			/* AGF, AGI btree root blocks */
	if (xfs_has_finobt(mp))
		blocks++;		/* finobt root block */
	if (xfs_has_rmapbt(mp))
		blocks++;		/* rmap root block */
	if (xfs_has_reflink(mp))
		blocks++;		/* refcount root block */

	return mp->m_sb.sb_agblocks - blocks;
}


static int
xfs_alloc_lookup(
	struct xfs_btree_cur	*cur,
	xfs_lookup_t		dir,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	int			*stat)
{
	int			error;

	cur->bc_rec.a.ar_startblock = bno;
	cur->bc_rec.a.ar_blockcount = len;
	error = xfs_btree_lookup(cur, dir, stat);
	if (*stat == 1)
		cur->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
	else
		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
	return error;
}

/*
 * Lookup the record equal to [bno, len] in the btree given by cur.
 */
static inline int			/* error */
xfs_alloc_lookup_eq(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_EQ, bno, len, stat);
}

/*
 * Lookup the first record greater than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_ge(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_GE, bno, len, stat);
}
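/*
 * Illustrative note (hypothetical records): the by-size (cnt) btree sorts its
 * records by (blockcount, startblock), so xfs_alloc_lookup_ge() on that tree
 * with bno == 0 positions the cursor at the smallest free extent that is at
 * least len blocks long.  Given records (4 blocks @ 10), (8 blocks @ 3) and
 * (8 blocks @ 50), a lookup_ge of [0, 8] lands on (8 blocks @ 3).
 */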
/*
 * Lookup the first record less than or equal to [bno, len]
 * in the btree given by cur.
 */
int					/* error */
xfs_alloc_lookup_le(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len,	/* length of extent */
	int			*stat)	/* success/failure */
{
	return xfs_alloc_lookup(cur, XFS_LOOKUP_LE, bno, len, stat);
}

static inline bool
xfs_alloc_cur_active(
	struct xfs_btree_cur	*cur)
{
	return cur && (cur->bc_flags & XFS_BTREE_ALLOCBT_ACTIVE);
}

/*
 * Update the record referred to by cur to the value given
 * by [bno, len].
 * This either works (return 0) or gets an EFSCORRUPTED error.
 */
STATIC int				/* error */
xfs_alloc_update(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		bno,	/* starting block of extent */
	xfs_extlen_t		len)	/* length of extent */
{
	union xfs_btree_rec	rec;

	rec.alloc.ar_startblock = cpu_to_be32(bno);
	rec.alloc.ar_blockcount = cpu_to_be32(len);
	return xfs_btree_update(cur, &rec);
}

/* Convert the ondisk btree record to its incore representation. */
void
xfs_alloc_btrec_to_irec(
	const union xfs_btree_rec	*rec,
	struct xfs_alloc_rec_incore	*irec)
{
	irec->ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
	irec->ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
}

/* Simple checks for free space records. */
xfs_failaddr_t
xfs_alloc_check_irec(
	struct xfs_perag			*pag,
	const struct xfs_alloc_rec_incore	*irec)
{
	if (irec->ar_blockcount == 0)
		return __this_address;

	/* check for valid extent range, including overflow */
	if (!xfs_verify_agbext(pag, irec->ar_startblock, irec->ar_blockcount))
		return __this_address;

	return NULL;
}

static inline int
xfs_alloc_complain_bad_rec(
	struct xfs_btree_cur		*cur,
	xfs_failaddr_t			fa,
	const struct xfs_alloc_rec_incore *irec)
{
	struct xfs_mount		*mp = cur->bc_mp;

	xfs_warn(mp,
		"%sbt record corruption in AG %d detected at %pS!",
		cur->bc_ops->name, cur->bc_ag.pag->pag_agno, fa);
	xfs_warn(mp,
		"start block 0x%x block count 0x%x", irec->ar_startblock,
		irec->ar_blockcount);
	xfs_btree_mark_sick(cur);
	return -EFSCORRUPTED;
}

/*
 * Get the data from the pointed-to record.
 */
int					/* error */
xfs_alloc_get_rec(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	xfs_agblock_t		*bno,	/* output: starting block of extent */
	xfs_extlen_t		*len,	/* output: length of extent */
	int			*stat)	/* output: success/failure */
{
	struct xfs_alloc_rec_incore irec;
	union xfs_btree_rec	*rec;
	xfs_failaddr_t		fa;
	int			error;

	error = xfs_btree_get_rec(cur, &rec, stat);
	if (error || !(*stat))
		return error;

	xfs_alloc_btrec_to_irec(rec, &irec);
	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
	if (fa)
		return xfs_alloc_complain_bad_rec(cur, fa, &irec);

	*bno = irec.ar_startblock;
	*len = irec.ar_blockcount;
	return 0;
}
/*
 * Compute aligned version of the found extent.
 * Takes alignment and min length into account.
 */
STATIC bool
xfs_alloc_compute_aligned(
	xfs_alloc_arg_t	*args,		/* allocation argument structure */
	xfs_agblock_t	foundbno,	/* starting block in found extent */
	xfs_extlen_t	foundlen,	/* length in found extent */
	xfs_agblock_t	*resbno,	/* result block number */
	xfs_extlen_t	*reslen,	/* result length */
	unsigned	*busy_gen)
{
	xfs_agblock_t	bno = foundbno;
	xfs_extlen_t	len = foundlen;
	xfs_extlen_t	diff;
	bool		busy;

	/* Trim busy sections out of found extent */
	busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);

	/*
	 * If we have a largish extent that happens to start before min_agbno,
	 * see if we can shift it into range...
	 */
	if (bno < args->min_agbno && bno + len > args->min_agbno) {
		diff = args->min_agbno - bno;
		if (len > diff) {
			bno += diff;
			len -= diff;
		}
	}

	if (args->alignment > 1 && len >= args->minlen) {
		xfs_agblock_t	aligned_bno = roundup(bno, args->alignment);

		diff = aligned_bno - bno;

		*resbno = aligned_bno;
		*reslen = diff >= len ? 0 : len - diff;
	} else {
		*resbno = bno;
		*reslen = len;
	}

	return busy;
}

/*
 * Compute best start block and diff for "near" allocations.
 * freelen >= wantlen already checked by caller.
 */
STATIC xfs_extlen_t			/* difference value (absolute) */
xfs_alloc_compute_diff(
	xfs_agblock_t	wantbno,	/* target starting block */
	xfs_extlen_t	wantlen,	/* target length */
	xfs_extlen_t	alignment,	/* target alignment */
	int		datatype,	/* are we allocating data? */
	xfs_agblock_t	freebno,	/* freespace's starting block */
	xfs_extlen_t	freelen,	/* freespace's length */
	xfs_agblock_t	*newbnop)	/* result: best start block from free */
{
	xfs_agblock_t	freeend;	/* end of freespace extent */
	xfs_agblock_t	newbno1;	/* return block number */
	xfs_agblock_t	newbno2;	/* other new block number */
	xfs_extlen_t	newlen1 = 0;	/* length with newbno1 */
	xfs_extlen_t	newlen2 = 0;	/* length with newbno2 */
	xfs_agblock_t	wantend;	/* end of target extent */
	bool		userdata = datatype & XFS_ALLOC_USERDATA;

	ASSERT(freelen >= wantlen);
	freeend = freebno + freelen;
	wantend = wantbno + wantlen;
	/*
	 * We want to allocate from the start of a free extent if it is past
	 * the desired block or if we are allocating user data and the free
	 * extent is before the desired block. The second case is there to
	 * allow for contiguous allocation from the remaining free space if
	 * the file grows in the short term.
	 */
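	/*
	 * Illustrative example (hypothetical numbers): wantbno = 100,
	 * wantlen = 10, freebno = 120, freelen = 50, alignment = 1.  Here
	 * freebno >= wantbno, so we take the start of the free extent and
	 * the returned diff is XFS_ABSDIFF(120, 100) == 20.
	 */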
	if (freebno >= wantbno || (userdata && freeend < wantend)) {
		if ((newbno1 = roundup(freebno, alignment)) >= freeend)
			newbno1 = NULLAGBLOCK;
	} else if (freeend >= wantend && alignment > 1) {
		newbno1 = roundup(wantbno, alignment);
		newbno2 = newbno1 - alignment;
		if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
		else
			newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
		if (newbno2 < freebno)
			newbno2 = NULLAGBLOCK;
		else
			newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
		if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
			if (newlen1 < newlen2 ||
			    (newlen1 == newlen2 &&
			     XFS_ABSDIFF(newbno1, wantbno) >
			     XFS_ABSDIFF(newbno2, wantbno)))
				newbno1 = newbno2;
		} else if (newbno2 != NULLAGBLOCK)
			newbno1 = newbno2;
	} else if (freeend >= wantend) {
		newbno1 = wantbno;
	} else if (alignment > 1) {
		newbno1 = roundup(freeend - wantlen, alignment);
		if (newbno1 > freeend - wantlen &&
		    newbno1 - alignment >= freebno)
			newbno1 -= alignment;
		else if (newbno1 >= freeend)
			newbno1 = NULLAGBLOCK;
	} else
		newbno1 = freeend - wantlen;
	*newbnop = newbno1;
	return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
}

/*
 * Fix up the length, based on mod and prod.
 * len should be k * prod + mod for some k.
 * If len is too small it is returned unchanged.
 * If len hits maxlen it is left alone.
 */
STATIC void
xfs_alloc_fix_len(
	xfs_alloc_arg_t	*args)		/* allocation argument structure */
{
	xfs_extlen_t	k;
	xfs_extlen_t	rlen;

	ASSERT(args->mod < args->prod);
	rlen = args->len;
	ASSERT(rlen >= args->minlen);
	ASSERT(rlen <= args->maxlen);
	if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
	    (args->mod == 0 && rlen < args->prod))
		return;
	k = rlen % args->prod;
	if (k == args->mod)
		return;
	if (k > args->mod)
		rlen = rlen - (k - args->mod);
	else
		rlen = rlen - args->prod + (args->mod - k);
	/* casts to (int) catch length underflows */
	if ((int)rlen < (int)args->minlen)
		return;
	ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
	ASSERT(rlen % args->prod == args->mod);
	ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
		rlen + args->minleft);
	args->len = rlen;
}
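/*
 * Worked example for xfs_alloc_fix_len() (hypothetical values): with
 * args->prod == 4, args->mod == 1 and args->len == 14, we get
 * k = 14 % 4 == 2 > mod, so the length is trimmed to 14 - (2 - 1) == 13,
 * which satisfies 13 % 4 == 1 == mod (assuming minlen <= 13 and maxlen > 14).
 */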
/*
 * Update the two btrees, logically removing from freespace the extent
 * starting at rbno, rlen blocks.  The extent is contained within the
 * actual (current) free extent fbno for flen blocks.
 * Flags are passed in indicating whether the cursors are set to the
 * relevant records.
 */
STATIC int				/* error code */
xfs_alloc_fixup_trees(
	struct xfs_btree_cur *cnt_cur,	/* cursor for by-size btree */
	struct xfs_btree_cur *bno_cur,	/* cursor for by-block btree */
	xfs_agblock_t	fbno,		/* starting block of free extent */
	xfs_extlen_t	flen,		/* length of free extent */
	xfs_agblock_t	rbno,		/* starting block of returned extent */
	xfs_extlen_t	rlen,		/* length of returned extent */
	int		flags)		/* flags, XFSA_FIXUP_... */
{
	int		error;		/* error code */
	int		i;		/* operation results */
	xfs_agblock_t	nfbno1;		/* first new free startblock */
	xfs_agblock_t	nfbno2;		/* second new free startblock */
	xfs_extlen_t	nflen1 = 0;	/* first new free length */
	xfs_extlen_t	nflen2 = 0;	/* second new free length */
	struct xfs_mount *mp;

	mp = cnt_cur->bc_mp;

	/*
	 * Look up the record in the by-size tree if necessary.
	 */
	if (flags & XFSA_FIXUP_CNT_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
	}
	/*
	 * Look up the record in the by-block tree if necessary.
	 */
	if (flags & XFSA_FIXUP_BNO_OK) {
#ifdef DEBUG
		if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp,
				   i != 1 ||
				   nfbno1 != fbno ||
				   nflen1 != flen)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
#endif
	} else {
		if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	}

#ifdef DEBUG
	if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
		struct xfs_btree_block	*bnoblock;
		struct xfs_btree_block	*cntblock;

		bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
		cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);

		if (XFS_IS_CORRUPT(mp,
				   bnoblock->bb_numrecs !=
				   cntblock->bb_numrecs)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	}
#endif

	/*
	 * Deal with all four cases: the allocated record is contained
	 * within the freespace record, so we can have new freespace
	 * at either (or both) end, or no freespace remaining.
	 */
	if (rbno == fbno && rlen == flen)
		nfbno1 = nfbno2 = NULLAGBLOCK;
	else if (rbno == fbno) {
		nfbno1 = rbno + rlen;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else if (rbno + rlen == fbno + flen) {
		nfbno1 = fbno;
		nflen1 = flen - rlen;
		nfbno2 = NULLAGBLOCK;
	} else {
		nfbno1 = fbno;
		nflen1 = rbno - fbno;
		nfbno2 = rbno + rlen;
		nflen2 = (fbno + flen) - nfbno2;
	}
	/*
	 * Delete the entry from the by-size btree.
	 */
	if ((error = xfs_btree_delete(cnt_cur, &i)))
		return error;
	if (XFS_IS_CORRUPT(mp, i != 1)) {
		xfs_btree_mark_sick(cnt_cur);
		return -EFSCORRUPTED;
	}
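	/*
	 * Illustrative example of the split above (hypothetical numbers):
	 * allocating [rbno = 104, rlen = 8] out of the free extent
	 * [fbno = 100, flen = 20] leaves two fragments, nfbno1/nflen1 =
	 * [100, 4] and nfbno2/nflen2 = [112, 8], which are inserted below.
	 */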
	/*
	 * Add new by-size btree entry(s).
	 */
	if (nfbno1 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
	}
	if (nfbno2 != NULLAGBLOCK) {
		if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
		if ((error = xfs_btree_insert(cnt_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			return -EFSCORRUPTED;
		}
	}
	/*
	 * Fix up the by-block btree entry(s).
	 */
	if (nfbno1 == NULLAGBLOCK) {
		/*
		 * No remaining freespace, just delete the by-block tree entry.
		 */
		if ((error = xfs_btree_delete(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	} else {
		/*
		 * Update the by-block entry to start later|be shorter.
		 */
		if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
			return error;
	}
	if (nfbno2 != NULLAGBLOCK) {
		/*
		 * 2 resulting free entries, need to add one.
		 */
		if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 0)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
		if ((error = xfs_btree_insert(bno_cur, &i)))
			return error;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			return -EFSCORRUPTED;
		}
	}
	return 0;
}

/*
 * We do not verify the AGFL contents against AGF-based index counters here,
 * even though we may have access to the perag that contains shadow copies. We
 * don't know if the AGF based counters have been checked, and if they have
 * they still may be inconsistent because they haven't yet been reset on the
 * first allocation after the AGF has been read in.
 *
 * This means we can only check that all agfl entries contain valid or null
 * values because we can't reliably determine the active range to exclude
 * NULLAGBNO as a valid value.
 *
 * However, we can't even do that for v4 format filesystems because there are
 * old versions of mkfs out there that do not initialise the AGFL to known,
 * verifiable values. Hence we can't tell the difference between an AGFL block
 * allocated by mkfs and a corrupted AGFL block here on v4 filesystems.
 *
 * As a result, we can only fully validate AGFL block numbers when we pull
 * them from the freelist in xfs_alloc_get_freelist().
 */
static xfs_failaddr_t
xfs_agfl_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	struct xfs_agfl	*agfl = XFS_BUF_TO_AGFL(bp);
	__be32		*agfl_bno = xfs_buf_to_agfl_bno(bp);
	int		i;

	if (!xfs_has_crc(mp))
		return NULL;

	if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
		return __this_address;
	if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/*
	 * during growfs operations, the perag is not fully initialised,
	 * so we can't use it for any useful checking. growfs ensures we can't
	 * use it by using uncached buffers that don't have the perag attached
	 * so we can detect and avoid this problem.
	 */
	if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
		return __this_address;

	for (i = 0; i < xfs_agfl_size(mp); i++) {
		if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
		    be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
			return __this_address;
	}

	if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
		return __this_address;
	return NULL;
}

static void
xfs_agfl_read_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount *mp = bp->b_mount;
	xfs_failaddr_t	fa;

	/*
	 * There is no verification of non-crc AGFLs because mkfs does not
	 * initialise the AGFL to zero or NULL. Hence the only valid part of
	 * the AGFL is what the AGF says is active. We can't get to the AGF,
	 * so we can't verify just those entries are valid.
	 */
	if (!xfs_has_crc(mp))
		return;

	if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
		xfs_verifier_error(bp, -EFSBADCRC, __this_address);
	else {
		fa = xfs_agfl_verify(bp);
		if (fa)
			xfs_verifier_error(bp, -EFSCORRUPTED, fa);
	}
}

static void
xfs_agfl_write_verify(
	struct xfs_buf	*bp)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_buf_log_item	*bip = bp->b_log_item;
	xfs_failaddr_t		fa;

	/* no verification of non-crc AGFLs */
	if (!xfs_has_crc(mp))
		return;

	fa = xfs_agfl_verify(bp);
	if (fa) {
		xfs_verifier_error(bp, -EFSCORRUPTED, fa);
		return;
	}

	if (bip)
		XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);

	xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
}

const struct xfs_buf_ops xfs_agfl_buf_ops = {
	.name = "xfs_agfl",
	.magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
	.verify_read = xfs_agfl_read_verify,
	.verify_write = xfs_agfl_write_verify,
	.verify_struct = xfs_agfl_verify,
};

/*
 * Read in the allocation group free block array.
 */
int
xfs_alloc_read_agfl(
	struct xfs_perag	*pag,
	struct xfs_trans	*tp,
	struct xfs_buf		**bpp)
{
	struct xfs_mount	*mp = pag->pag_mount;
	struct xfs_buf		*bp;
	int			error;

	error = xfs_trans_read_buf(
			mp, tp, mp->m_ddev_targp,
			XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
			XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
	if (xfs_metadata_is_sick(error))
		xfs_ag_mark_sick(pag, XFS_SICK_AG_AGFL);
	if (error)
		return error;
	xfs_buf_set_ref(bp, XFS_AGFL_REF);
	*bpp = bp;
	return 0;
}

STATIC int
xfs_alloc_update_counters(
	struct xfs_trans	*tp,
	struct xfs_buf		*agbp,
	long			len)
{
	struct xfs_agf		*agf = agbp->b_addr;

	agbp->b_pag->pagf_freeblks += len;
	be32_add_cpu(&agf->agf_freeblks, len);

	if (unlikely(be32_to_cpu(agf->agf_freeblks) >
		     be32_to_cpu(agf->agf_length))) {
		xfs_buf_mark_corrupt(agbp);
		xfs_ag_mark_sick(agbp->b_pag, XFS_SICK_AG_AGF);
		return -EFSCORRUPTED;
	}

	xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
	return 0;
}
/*
 * Block allocation algorithm and data structures.
 */
struct xfs_alloc_cur {
	struct xfs_btree_cur		*cnt;	/* btree cursors */
	struct xfs_btree_cur		*bnolt;
	struct xfs_btree_cur		*bnogt;
	xfs_extlen_t			cur_len;/* current search length */
	xfs_agblock_t			rec_bno;/* extent startblock */
	xfs_extlen_t			rec_len;/* extent length */
	xfs_agblock_t			bno;	/* alloc bno */
	xfs_extlen_t			len;	/* alloc len */
	xfs_extlen_t			diff;	/* diff from search bno */
	unsigned int			busy_gen;/* busy state */
	bool				busy;
};

/*
 * Set up cursors, etc. in the extent allocation cursor. This function can be
 * called multiple times to reset an initialized structure without having to
 * reallocate cursors.
 */
static int
xfs_alloc_cur_setup(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;
	int			i;

	acur->cur_len = args->maxlen;
	acur->rec_bno = 0;
	acur->rec_len = 0;
	acur->bno = 0;
	acur->len = 0;
	acur->diff = -1;
	acur->busy = false;
	acur->busy_gen = 0;

	/*
	 * Perform an initial cntbt lookup to check for availability of maxlen
	 * extents. If this fails, we'll return -ENOSPC to signal the caller to
	 * attempt a small allocation.
	 */
	if (!acur->cnt)
		acur->cnt = xfs_cntbt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
	if (error)
		return error;

	/*
	 * Allocate the bnobt left and right search cursors.
	 */
	if (!acur->bnolt)
		acur->bnolt = xfs_bnobt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	if (!acur->bnogt)
		acur->bnogt = xfs_bnobt_init_cursor(args->mp, args->tp,
					args->agbp, args->pag);
	return i == 1 ? 0 : -ENOSPC;
}

static void
xfs_alloc_cur_close(
	struct xfs_alloc_cur	*acur,
	bool			error)
{
	int			cur_error = XFS_BTREE_NOERROR;

	if (error)
		cur_error = XFS_BTREE_ERROR;

	if (acur->cnt)
		xfs_btree_del_cursor(acur->cnt, cur_error);
	if (acur->bnolt)
		xfs_btree_del_cursor(acur->bnolt, cur_error);
	if (acur->bnogt)
		xfs_btree_del_cursor(acur->bnogt, cur_error);
	acur->cnt = acur->bnolt = acur->bnogt = NULL;
}

/*
 * Check an extent for allocation and track the best available candidate in
 * the allocation structure. The cursor is deactivated if it has entered an
 * out of range state based on allocation arguments. Optionally return the
 * extent geometry and allocation status if requested by the caller.
 */
static int
xfs_alloc_cur_check(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	int			*new)
{
	int			error, i;
	xfs_agblock_t		bno, bnoa, bnew;
	xfs_extlen_t		len, lena, diff = -1;
	bool			busy;
	unsigned		busy_gen = 0;
	bool			deactivate = false;
	bool			isbnobt = xfs_btree_is_bno(cur->bc_ops);

	*new = 0;

	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	/*
	 * Check minlen and deactivate a cntbt cursor if out of acceptable size
	 * range (i.e., walking backwards looking for a minlen extent).
	 */
	if (len < args->minlen) {
		deactivate = !isbnobt;
		goto out;
	}

	busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
					 &busy_gen);
	acur->busy |= busy;
	if (busy)
		acur->busy_gen = busy_gen;
	/* deactivate a bnobt cursor outside of locality range */
	if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
		deactivate = isbnobt;
		goto out;
	}
	if (lena < args->minlen)
		goto out;

	args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
	xfs_alloc_fix_len(args);
	ASSERT(args->len >= args->minlen);
	if (args->len < acur->len)
		goto out;

	/*
	 * We have an aligned record that satisfies minlen and beats or matches
	 * the candidate extent size. Compare locality for near allocation
	 * mode.
	 */
	diff = xfs_alloc_compute_diff(args->agbno, args->len,
				      args->alignment, args->datatype,
				      bnoa, lena, &bnew);
	if (bnew == NULLAGBLOCK)
		goto out;

	/*
	 * Deactivate a bnobt cursor with worse locality than the current best.
	 */
	if (diff > acur->diff) {
		deactivate = isbnobt;
		goto out;
	}

	ASSERT(args->len > acur->len ||
	       (args->len == acur->len && diff <= acur->diff));
	acur->rec_bno = bno;
	acur->rec_len = len;
	acur->bno = bnew;
	acur->len = args->len;
	acur->diff = diff;
	*new = 1;

	/*
	 * We're done if we found a perfect allocation. This only deactivates
	 * the current cursor, but this is just an optimization to terminate a
	 * cntbt search that otherwise runs to the edge of the tree.
	 */
	if (acur->diff == 0 && acur->len == args->maxlen)
		deactivate = true;
out:
	if (deactivate)
		cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;
	trace_xfs_alloc_cur_check(cur, bno, len, diff, *new);
	return 0;
}

/*
 * Complete an allocation of a candidate extent. Remove the extent from both
 * trees and update the args structure.
 */
STATIC int
xfs_alloc_cur_finish(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur)
{
	int			error;

	ASSERT(acur->cnt && acur->bnolt);
	ASSERT(acur->bno >= acur->rec_bno);
	ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
	ASSERT(xfs_verify_agbext(args->pag, acur->rec_bno, acur->rec_len));

	error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
				      acur->rec_len, acur->bno, acur->len, 0);
	if (error)
		return error;

	args->agbno = acur->bno;
	args->len = acur->len;
	args->wasfromfl = 0;

	trace_xfs_alloc_cur(args);
	return 0;
}
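/*
 * Typical flow of the allocation cursor machinery above (an illustrative
 * sketch only; the real callers are the xfs_alloc_ag_vextent_* functions
 * below):
 *
 *	struct xfs_alloc_cur	acur = {};
 *	int			i;
 *
 *	error = xfs_alloc_cur_setup(args, &acur);	// cursors + cntbt probe
 *	while (<walking bnobt/cntbt records>)
 *		error = xfs_alloc_cur_check(args, &acur, cur, &i);
 *	if (acur.len)					// best candidate found
 *		error = xfs_alloc_cur_finish(args, &acur);
 *	xfs_alloc_cur_close(&acur, error != 0);
 */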
/*
 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
 * bno optimized lookup to search for extents with ideal size and locality.
 */
STATIC int
xfs_alloc_cntbt_iter(
	struct xfs_alloc_arg		*args,
	struct xfs_alloc_cur		*acur)
{
	struct xfs_btree_cur	*cur = acur->cnt;
	xfs_agblock_t		bno;
	xfs_extlen_t		len, cur_len;
	int			error;
	int			i;

	if (!xfs_alloc_cur_active(cur))
		return 0;

	/* locality optimized lookup */
	cur_len = acur->cur_len;
	error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
	if (error)
		return error;
	if (i == 0)
		return 0;
	error = xfs_alloc_get_rec(cur, &bno, &len, &i);
	if (error)
		return error;

	/* check the current record and update search length from it */
	error = xfs_alloc_cur_check(args, acur, cur, &i);
	if (error)
		return error;
	ASSERT(len >= acur->cur_len);
	acur->cur_len = len;

	/*
	 * We looked up the first record >= [agbno, len] above. The agbno is a
	 * secondary key and so the current record may lie just before or after
	 * agbno. If it is past agbno, check the previous record too so long as
	 * the length matches as it may be closer. Don't check a smaller record
	 * because that could deactivate our cursor.
	 */
	if (bno > args->agbno) {
		error = xfs_btree_decrement(cur, 0, &i);
		if (!error && i) {
			error = xfs_alloc_get_rec(cur, &bno, &len, &i);
			if (!error && i && len == acur->cur_len)
				error = xfs_alloc_cur_check(args, acur, cur,
							    &i);
		}
		if (error)
			return error;
	}

	/*
	 * Increment the search key until we find at least one allocation
	 * candidate or if the extent we found was larger. Otherwise, double
	 * the search key to optimize the search. Efficiency is more important
	 * here than absolute best locality.
	 */
	cur_len <<= 1;
	if (!acur->len || acur->cur_len >= cur_len)
		acur->cur_len++;
	else
		acur->cur_len = cur_len;

	return error;
}

/*
 * Deal with the case where only small freespaces remain. Either return the
 * contents of the last freespace record, or allocate space from the freelist
 * if there is nothing in the tree.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_small(
	struct xfs_alloc_arg	*args,	/* allocation argument structure */
	struct xfs_btree_cur	*ccur,	/* optional by-size cursor */
	xfs_agblock_t		*fbnop,	/* result block number */
	xfs_extlen_t		*flenp,	/* result length */
	int			*stat)	/* status: 0-freelist, 1-normal/none */
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	int			error = 0;
	xfs_agblock_t		fbno = NULLAGBLOCK;
	xfs_extlen_t		flen = 0;
	int			i = 0;
	/*
	 * If a cntbt cursor is provided, try to allocate the largest record in
	 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
	 * allocation. Make sure to respect minleft even when pulling from the
	 * freelist.
	 */
	if (ccur)
		error = xfs_btree_decrement(ccur, 0, &i);
	if (error)
		goto error;
	if (i) {
		error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
		if (error)
			goto error;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			xfs_btree_mark_sick(ccur);
			error = -EFSCORRUPTED;
			goto error;
		}
		goto out;
	}

	if (args->minlen != 1 || args->alignment != 1 ||
	    args->resv == XFS_AG_RESV_AGFL ||
	    be32_to_cpu(agf->agf_flcount) <= args->minleft)
		goto out;

	error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
				       &fbno, 0);
	if (error)
		goto error;
	if (fbno == NULLAGBLOCK)
		goto out;

	xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
			      (args->datatype & XFS_ALLOC_NOBUSY));

	if (args->datatype & XFS_ALLOC_USERDATA) {
		struct xfs_buf	*bp;

		error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
				XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
				args->mp->m_bsize, 0, &bp);
		if (error)
			goto error;
		xfs_trans_binval(args->tp, bp);
	}
	*fbnop = args->agbno = fbno;
	*flenp = args->len = 1;
	if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
		xfs_btree_mark_sick(ccur);
		error = -EFSCORRUPTED;
		goto error;
	}
	args->wasfromfl = 1;
	trace_xfs_alloc_small_freelist(args);

	/*
	 * If we're feeding an AGFL block to something that doesn't live in the
	 * free space, we need to clear out the OWN_AG rmap.
	 */
	error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
			      &XFS_RMAP_OINFO_AG);
	if (error)
		goto error;

	*stat = 0;
	return 0;

out:
	/*
	 * Can't do the allocation, give up.
	 */
	if (flen < args->minlen) {
		args->agbno = NULLAGBLOCK;
		trace_xfs_alloc_small_notenough(args);
		flen = 0;
	}
	*fbnop = fbno;
	*flenp = flen;
	*stat = 1;
	trace_xfs_alloc_small_done(args);
	return 0;

error:
	trace_xfs_alloc_small_error(args);
	return error;
}

/*
 * Allocate a variable extent at exactly agno/bno.
 * Extent's length (returned in *len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
 */
STATIC int			/* error */
xfs_alloc_ag_vextent_exact(
	xfs_alloc_arg_t	*args)	/* allocation argument structure */
{
	struct xfs_btree_cur *bno_cur;	/* by block-number btree cursor */
	struct xfs_btree_cur *cnt_cur;	/* by count btree cursor */
	int		error;
	xfs_agblock_t	fbno;	/* start block of found extent */
	xfs_extlen_t	flen;	/* length of found extent */
	xfs_agblock_t	tbno;	/* start block of busy extent */
	xfs_extlen_t	tlen;	/* length of busy extent */
	xfs_agblock_t	tend;	/* end block of busy extent */
	int		i;	/* success/failure of operation */
	unsigned	busy_gen;

	ASSERT(args->alignment == 1);

	/*
	 * Allocate/initialize a cursor for the by-number freespace btree.
	 */
	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);
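	/*
	 * Illustrative example of the exact allocation below (hypothetical
	 * numbers): args->agbno = 100, minlen = 4, maxlen = 16.  If the
	 * by-block record containing block 100 is [90, 40] and none of it is
	 * busy, then tend = 130 and the trimmed request becomes
	 * [100, min(130, 116) - 100] = [100, 16].
	 */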
	/*
	 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
	 * Look for the closest free block <= bno, it must contain bno
	 * if any free block does.
	 */
	error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
	if (error)
		goto error0;
	if (!i)
		goto not_found;

	/*
	 * Grab the freespace record.
	 */
	error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
	if (error)
		goto error0;
	if (XFS_IS_CORRUPT(args->mp, i != 1)) {
		xfs_btree_mark_sick(bno_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	ASSERT(fbno <= args->agbno);

	/*
	 * Check for overlapping busy extents.
	 */
	tbno = fbno;
	tlen = flen;
	xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);

	/*
	 * Give up if the start of the extent is busy, or the freespace isn't
	 * long enough for the minimum request.
	 */
	if (tbno > args->agbno)
		goto not_found;
	if (tlen < args->minlen)
		goto not_found;
	tend = tbno + tlen;
	if (tend < args->agbno + args->minlen)
		goto not_found;

	/*
	 * End of extent will be smaller of the freespace end and the
	 * maximal requested end.
	 *
	 * Fix the length according to mod and prod if given.
	 */
	args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
						- args->agbno;
	xfs_alloc_fix_len(args);
	ASSERT(args->agbno + args->len <= tend);

	/*
	 * We are allocating agbno for args->len
	 * Allocate/initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);
	ASSERT(xfs_verify_agbext(args->pag, args->agbno, args->len));
	error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
				      args->len, XFSA_FIXUP_BNO_OK);
	if (error) {
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
		goto error0;
	}

	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);

	args->wasfromfl = 0;
	trace_xfs_alloc_exact_done(args);
	return 0;

not_found:
	/* Didn't find it, return null. */
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	args->agbno = NULLAGBLOCK;
	trace_xfs_alloc_exact_notfound(args);
	return 0;

error0:
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	trace_xfs_alloc_exact_error(args);
	return error;
}

/*
 * Search a given number of btree records in a given direction. Check each
 * record against the good extent we've already found.
 */
STATIC int
xfs_alloc_walk_iter(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	struct xfs_btree_cur	*cur,
	bool			increment,
	bool			find_one, /* quit on first candidate */
	int			count,    /* rec count (-1 for infinite) */
	int			*stat)
{
	int			error;
	int			i;

	*stat = 0;
	/*
	 * Search so long as the cursor is active or we find a better extent.
	 * The cursor is deactivated if it extends beyond the range of the
	 * current allocation candidate.
	 */
	while (xfs_alloc_cur_active(cur) && count) {
		error = xfs_alloc_cur_check(args, acur, cur, &i);
		if (error)
			return error;
		if (i == 1) {
			*stat = 1;
			if (find_one)
				break;
		}
		if (!xfs_alloc_cur_active(cur))
			break;

		if (increment)
			error = xfs_btree_increment(cur, 0, &i);
		else
			error = xfs_btree_decrement(cur, 0, &i);
		if (error)
			return error;
		if (i == 0)
			cur->bc_flags &= ~XFS_BTREE_ALLOCBT_ACTIVE;

		if (count > 0)
			count--;
	}

	return 0;
}

/*
 * Search the by-bno and by-size btrees in parallel in search of an extent
 * with ideal locality based on the NEAR mode ->agbno locality hint.
 */
STATIC int
xfs_alloc_ag_vextent_locality(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	int			*stat)
{
	struct xfs_btree_cur	*fbcur = NULL;
	int			error;
	int			i;
	bool			fbinc;

	ASSERT(acur->len == 0);

	*stat = 0;

	error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
	if (error)
		return error;
	error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
	if (error)
		return error;

	/*
	 * Search the bnobt and cntbt in parallel. Search the bnobt left and
	 * right and lookup the closest extent to the locality hint for each
	 * extent size key in the cntbt. The entire search terminates
	 * immediately on a bnobt hit because that means we've found best case
	 * locality. Otherwise the search continues until the cntbt cursor runs
	 * off the end of the tree. If no allocation candidate is found at this
	 * point, give up on locality, walk backwards from the end of the cntbt
	 * and take the first available extent.
	 *
	 * The parallel tree searches balance each other out to provide fairly
	 * consistent performance for various situations. The bnobt search can
	 * have pathological behavior in the worst case scenario of larger
	 * allocation requests and fragmented free space. On the other hand,
	 * the bnobt is able to satisfy most smaller allocation requests much
	 * more quickly than the cntbt. The cntbt search can sift through
	 * fragmented free space and sets of free extents for larger allocation
	 * requests more quickly than the bnobt. Since the locality hint is
	 * just a hint and we don't want to scan the entire bnobt for perfect
	 * locality, the cntbt search essentially bounds the bnobt search such
	 * that we can find good enough locality at reasonable performance in
	 * most cases.
	 */
	while (xfs_alloc_cur_active(acur->bnolt) ||
	       xfs_alloc_cur_active(acur->bnogt) ||
	       xfs_alloc_cur_active(acur->cnt)) {

		trace_xfs_alloc_cur_lookup(args);
		/*
		 * Search the bnobt left and right. In the case of a hit, finish
		 * the search in the opposite direction and we're done.
		 */
		error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
					    true, 1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_left(args);
			fbcur = acur->bnogt;
			fbinc = true;
			break;
		}
		error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
					    1, &i);
		if (error)
			return error;
		if (i == 1) {
			trace_xfs_alloc_cur_right(args);
			fbcur = acur->bnolt;
			fbinc = false;
			break;
		}

		/*
		 * Check the extent with best locality based on the current
		 * extent size search key and keep track of the best candidate.
		 */
		error = xfs_alloc_cntbt_iter(args, acur);
		if (error)
			return error;
		if (!xfs_alloc_cur_active(acur->cnt)) {
			trace_xfs_alloc_cur_lookup_done(args);
			break;
		}
	}

	/*
	 * If we failed to find anything due to busy extents, return empty
	 * handed so the caller can flush and retry. If no busy extents were
	 * found, walk backwards from the end of the cntbt as a last resort.
	 */
	if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
		error = xfs_btree_decrement(acur->cnt, 0, &i);
		if (error)
			return error;
		if (i) {
			acur->cnt->bc_flags |= XFS_BTREE_ALLOCBT_ACTIVE;
			fbcur = acur->cnt;
			fbinc = false;
		}
	}

	/*
	 * Search in the opposite direction for a better entry in the case of
	 * a bnobt hit or walk backwards from the end of the cntbt.
	 */
	if (fbcur) {
		error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
					    &i);
		if (error)
			return error;
	}

	if (acur->len)
		*stat = 1;

	return 0;
}

/* Check the last block of the cnt btree for allocations. */
static int
xfs_alloc_ag_vextent_lastblock(
	struct xfs_alloc_arg	*args,
	struct xfs_alloc_cur	*acur,
	xfs_agblock_t		*bno,
	xfs_extlen_t		*len,
	bool			*allocated)
{
	int			error;
	int			i;

#ifdef DEBUG
	/* Randomly don't execute the first algorithm. */
	if (get_random_u32_below(2))
		return 0;
#endif

	/*
	 * Start from the entry that lookup found, sequence through all larger
	 * free blocks. If we're actually pointing at a record smaller than
	 * maxlen, go to the start of this block, and skip all those smaller
	 * than minlen.
	 */
	if (*len || args->alignment > 1) {
		acur->cnt->bc_levels[0].ptr = 1;
		do {
			error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
			if (error)
				return error;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				xfs_btree_mark_sick(acur->cnt);
				return -EFSCORRUPTED;
			}
			if (*len >= args->minlen)
				break;
			error = xfs_btree_increment(acur->cnt, 0, &i);
			if (error)
				return error;
		} while (i);
		ASSERT(*len >= args->minlen);
		if (!i)
			return 0;
	}

	error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
	if (error)
		return error;

	/*
	 * It didn't work. We COULD be in a case where there's a good record
	 * somewhere, so try again.
	 */
	if (acur->len == 0)
		return 0;

	trace_xfs_alloc_near_first(args);
	*allocated = true;
	return 0;
}
/*
 * Allocate a variable extent near bno in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
STATIC int
xfs_alloc_ag_vextent_near(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	struct xfs_alloc_cur	acur = {};
	int			error;		/* error code */
	int			i;		/* result code, temporary */
	xfs_agblock_t		bno;
	xfs_extlen_t		len;

	/* handle uninitialized agbno range so caller doesn't have to */
	if (!args->min_agbno && !args->max_agbno)
		args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
	ASSERT(args->min_agbno <= args->max_agbno);

	/* clamp agbno to the range if it's outside */
	if (args->agbno < args->min_agbno)
		args->agbno = args->min_agbno;
	if (args->agbno > args->max_agbno)
		args->agbno = args->max_agbno;

	/* Retry once quickly if we find busy extents before blocking. */
	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
	len = 0;

	/*
	 * Set up cursors and see if there are any free extents as big as
	 * maxlen. If not, pick the last entry in the tree unless the tree is
	 * empty.
	 */
	error = xfs_alloc_cur_setup(args, &acur);
	if (error == -ENOSPC) {
		error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
						   &len, &i);
		if (error)
			goto out;
		if (i == 0 || len == 0) {
			trace_xfs_alloc_near_noentry(args);
			goto out;
		}
		ASSERT(i == 1);
	} else if (error) {
		goto out;
	}

	/*
	 * First algorithm.
	 * If the requested extent is large wrt the freespaces available
	 * in this a.g., then the cursor will be pointing to a btree entry
	 * near the right edge of the tree. If it's in the last btree leaf
	 * block, then we just examine all the entries in that block
	 * that are big enough, and pick the best one.
	 */
	if (xfs_btree_islastblock(acur.cnt, 0)) {
		bool		allocated = false;

		error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
						       &allocated);
		if (error)
			goto out;
		if (allocated)
			goto alloc_finish;
	}

	/*
	 * Second algorithm. Combined cntbt and bnobt search to find ideal
	 * locality.
	 */
	error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
	if (error)
		goto out;

	/*
	 * If we couldn't get anything, give up.
	 */
	if (!acur.len) {
		if (acur.busy) {
			/*
			 * Our only valid extents must have been busy. Flush
			 * and retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_near_busy(args);
			error = xfs_extent_busy_flush(args->tp, args->pag,
					acur.busy_gen, alloc_flags);
			if (error)
				goto out;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			goto restart;
		}
		trace_xfs_alloc_size_neither(args);
		args->agbno = NULLAGBLOCK;
		goto out;
	}

alloc_finish:
	/* fix up btrees on a successful allocation */
	error = xfs_alloc_cur_finish(args, &acur);

out:
	xfs_alloc_cur_close(&acur, error);
	return error;
}
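/*
 * Note on the by-size search below: the initial cntbt lookup in
 * xfs_alloc_ag_vextent_size() asks for maxlen + alignment - 1 blocks so that
 * an aligned run of maxlen blocks is guaranteed to fit.  For example
 * (hypothetical numbers), with maxlen == 8 and alignment == 4, an 11 block
 * extent always contains an aligned 8 block chunk no matter where it starts.
 */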
/*
 * Allocate a variable extent anywhere in the allocation group agno.
 * Extent's length (returned in len) will be between minlen and maxlen,
 * and of the form k * prod + mod unless there's nothing that large.
 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
 */
static int
xfs_alloc_ag_vextent_size(
	struct xfs_alloc_arg	*args,
	uint32_t		alloc_flags)
{
	struct xfs_agf		*agf = args->agbp->b_addr;
	struct xfs_btree_cur	*bno_cur;
	struct xfs_btree_cur	*cnt_cur;
	xfs_agblock_t		fbno;		/* start of found freespace */
	xfs_extlen_t		flen;		/* length of found freespace */
	xfs_agblock_t		rbno;		/* returned block number */
	xfs_extlen_t		rlen;		/* length of returned extent */
	bool			busy;
	unsigned		busy_gen;
	int			error;
	int			i;

	/* Retry once quickly if we find busy extents before blocking. */
	alloc_flags |= XFS_ALLOC_FLAG_TRYFLUSH;
restart:
	/*
	 * Allocate and initialize a cursor for the by-size btree.
	 */
	cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);
	bno_cur = NULL;

	/*
	 * Look for an entry >= maxlen+alignment-1 blocks.
	 */
	if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
			args->maxlen + args->alignment - 1, &i)))
		goto error0;

	/*
	 * If none then we have to settle for a smaller extent. In the case
	 * that there are no large extents, this will return the last entry in
	 * the tree unless the tree is empty. In the case that there are only
	 * busy large extents, this will return the largest small extent unless
	 * there are no smaller extents available.
	 */
	if (!i) {
		error = xfs_alloc_ag_vextent_small(args, cnt_cur,
						   &fbno, &flen, &i);
		if (error)
			goto error0;
		if (i == 0 || flen == 0) {
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			trace_xfs_alloc_size_noentry(args);
			return 0;
		}
		ASSERT(i == 1);
		busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
						 &rlen, &busy_gen);
	} else {
		/*
		 * Search for a non-busy extent that is large enough.
		 */
		for (;;) {
			error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
			if (error)
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				xfs_btree_mark_sick(cnt_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}

			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);

			if (rlen >= args->maxlen)
				break;

			error = xfs_btree_increment(cnt_cur, 0, &i);
			if (error)
				goto error0;
			if (i)
				continue;

			/*
			 * Our only valid extents must have been busy. Flush
			 * and retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_size_busy(args);
			error = xfs_extent_busy_flush(args->tp, args->pag,
					busy_gen, alloc_flags);
			if (error)
				goto error0;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			goto restart;
		}
	}
	/*
	 * In the first case above, we got the last entry in the
	 * by-size btree. Now we check to see if the space hits maxlen
	 * once aligned; if not, we search left for something better.
	 * This can't happen in the second case above.
	 */
	rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
	if (XFS_IS_CORRUPT(args->mp,
			   rlen != 0 &&
			   (rlen > flen ||
			    rbno + rlen > fbno + flen))) {
		xfs_btree_mark_sick(cnt_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	if (rlen < args->maxlen) {
		xfs_agblock_t	bestfbno;
		xfs_extlen_t	bestflen;
		xfs_agblock_t	bestrbno;
		xfs_extlen_t	bestrlen;

		bestrlen = rlen;
		bestrbno = rbno;
		bestflen = flen;
		bestfbno = fbno;
		for (;;) {
			if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
				goto error0;
			if (i == 0)
				break;
			if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
					&i)))
				goto error0;
			if (XFS_IS_CORRUPT(args->mp, i != 1)) {
				xfs_btree_mark_sick(cnt_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (flen < bestrlen)
				break;
			busy = xfs_alloc_compute_aligned(args, fbno, flen,
					&rbno, &rlen, &busy_gen);
			rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
			if (XFS_IS_CORRUPT(args->mp,
					   rlen != 0 &&
					   (rlen > flen ||
					    rbno + rlen > fbno + flen))) {
				xfs_btree_mark_sick(cnt_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
			if (rlen > bestrlen) {
				bestrlen = rlen;
				bestrbno = rbno;
				bestflen = flen;
				bestfbno = fbno;
				if (rlen == args->maxlen)
					break;
			}
		}
		if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
				&i)))
			goto error0;
		if (XFS_IS_CORRUPT(args->mp, i != 1)) {
			xfs_btree_mark_sick(cnt_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		rlen = bestrlen;
		rbno = bestrbno;
		flen = bestflen;
		fbno = bestfbno;
	}
	args->wasfromfl = 0;
	/*
	 * Fix up the length.
	 */
	args->len = rlen;
	if (rlen < args->minlen) {
		if (busy) {
			/*
			 * Our only valid extents must have been busy. Flush
			 * and retry the allocation again. If we get an -EAGAIN
			 * error, we're being told that a deadlock was avoided
			 * and the current transaction needs committing before
			 * the allocation can be retried.
			 */
			trace_xfs_alloc_size_busy(args);
			error = xfs_extent_busy_flush(args->tp, args->pag,
					busy_gen, alloc_flags);
			if (error)
				goto error0;

			alloc_flags &= ~XFS_ALLOC_FLAG_TRYFLUSH;
			xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
			goto restart;
		}
		goto out_nominleft;
	}
	xfs_alloc_fix_len(args);

	rlen = args->len;
	if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
		xfs_btree_mark_sick(cnt_cur);
		error = -EFSCORRUPTED;
		goto error0;
	}
	/*
	 * Allocate and initialize a cursor for the by-block tree.
	 */
	bno_cur = xfs_bnobt_init_cursor(args->mp, args->tp, args->agbp,
					args->pag);
	if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
			rbno, rlen, XFSA_FIXUP_CNT_OK)))
		goto error0;
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
	cnt_cur = bno_cur = NULL;
	args->len = rlen;
	args->agbno = rbno;
	if (XFS_IS_CORRUPT(args->mp,
			   args->agbno + args->len >
			   be32_to_cpu(agf->agf_length))) {
		xfs_ag_mark_sick(args->pag, XFS_SICK_AG_BNOBT);
		error = -EFSCORRUPTED;
		goto error0;
	}
	trace_xfs_alloc_size_done(args);
	return 0;

error0:
	trace_xfs_alloc_size_error(args);
	if (cnt_cur)
		xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
	if (bno_cur)
		xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
	return error;

out_nominleft:
	xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
	trace_xfs_alloc_size_nominleft(args);
	args->agbno = NULLAGBLOCK;
	return 0;
}

/*
 * Free the extent starting at agno/bno for length.
 */
STATIC int
xfs_free_ag_extent(
	struct xfs_trans		*tp,
	struct xfs_buf			*agbp,
	xfs_agnumber_t			agno,
	xfs_agblock_t			bno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type)
{
	struct xfs_mount		*mp;
	struct xfs_btree_cur		*bno_cur;
	struct xfs_btree_cur		*cnt_cur;
	xfs_agblock_t			gtbno; /* start of right neighbor */
	xfs_extlen_t			gtlen; /* length of right neighbor */
	xfs_agblock_t			ltbno; /* start of left neighbor */
	xfs_extlen_t			ltlen; /* length of left neighbor */
	xfs_agblock_t			nbno; /* new starting block of freesp */
	xfs_extlen_t			nlen; /* new length of freespace */
	int				haveleft; /* have a left neighbor */
	int				haveright; /* have a right neighbor */
	int				i;
	int				error;
	struct xfs_perag		*pag = agbp->b_pag;

	bno_cur = cnt_cur = NULL;
	mp = tp->t_mountp;

	if (!xfs_rmap_should_skip_owner_update(oinfo)) {
		error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
		if (error)
			goto error0;
	}

	/*
	 * Allocate and initialize a cursor for the by-block btree.
	 */
	bno_cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag);
	/*
	 * Look for a neighboring block on the left (lower block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
		goto error0;
	if (haveleft) {
		/*
		 * There is a block to our left.
		 */
		if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
			goto error0;
		if (XFS_IS_CORRUPT(mp, i != 1)) {
			xfs_btree_mark_sick(bno_cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		/*
		 * It's not contiguous, though.
		 */
		if (ltbno + ltlen < bno)
			haveleft = 0;
		else {
			/*
			 * If this failure happens the request to free this
			 * space was invalid, it's (partly) already free.
			 * Very bad.
			 */
			if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
				xfs_btree_mark_sick(bno_cur);
				error = -EFSCORRUPTED;
				goto error0;
			}
		}
	}
	/*
	 * Look for a neighboring block on the right (higher block numbers)
	 * that is contiguous with this space.
	 */
	if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
		goto error0;
2017 */ 2018 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i))) 2019 goto error0; 2020 if (XFS_IS_CORRUPT(mp, i != 1)) { 2021 xfs_btree_mark_sick(bno_cur); 2022 error = -EFSCORRUPTED; 2023 goto error0; 2024 } 2025 /* 2026 * It's not contiguous, though. 2027 */ 2028 if (bno + len < gtbno) 2029 haveright = 0; 2030 else { 2031 /* 2032 * If this failure happens the request to free this 2033 * space was invalid, it's (partly) already free. 2034 * Very bad. 2035 */ 2036 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) { 2037 xfs_btree_mark_sick(bno_cur); 2038 error = -EFSCORRUPTED; 2039 goto error0; 2040 } 2041 } 2042 } 2043 /* 2044 * Now allocate and initialize a cursor for the by-size tree. 2045 */ 2046 cnt_cur = xfs_cntbt_init_cursor(mp, tp, agbp, pag); 2047 /* 2048 * Have both left and right contiguous neighbors. 2049 * Merge all three into a single free block. 2050 */ 2051 if (haveleft && haveright) { 2052 /* 2053 * Delete the old by-size entry on the left. 2054 */ 2055 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2056 goto error0; 2057 if (XFS_IS_CORRUPT(mp, i != 1)) { 2058 xfs_btree_mark_sick(cnt_cur); 2059 error = -EFSCORRUPTED; 2060 goto error0; 2061 } 2062 if ((error = xfs_btree_delete(cnt_cur, &i))) 2063 goto error0; 2064 if (XFS_IS_CORRUPT(mp, i != 1)) { 2065 xfs_btree_mark_sick(cnt_cur); 2066 error = -EFSCORRUPTED; 2067 goto error0; 2068 } 2069 /* 2070 * Delete the old by-size entry on the right. 2071 */ 2072 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2073 goto error0; 2074 if (XFS_IS_CORRUPT(mp, i != 1)) { 2075 xfs_btree_mark_sick(cnt_cur); 2076 error = -EFSCORRUPTED; 2077 goto error0; 2078 } 2079 if ((error = xfs_btree_delete(cnt_cur, &i))) 2080 goto error0; 2081 if (XFS_IS_CORRUPT(mp, i != 1)) { 2082 xfs_btree_mark_sick(cnt_cur); 2083 error = -EFSCORRUPTED; 2084 goto error0; 2085 } 2086 /* 2087 * Delete the old by-block entry for the right block. 2088 */ 2089 if ((error = xfs_btree_delete(bno_cur, &i))) 2090 goto error0; 2091 if (XFS_IS_CORRUPT(mp, i != 1)) { 2092 xfs_btree_mark_sick(bno_cur); 2093 error = -EFSCORRUPTED; 2094 goto error0; 2095 } 2096 /* 2097 * Move the by-block cursor back to the left neighbor. 2098 */ 2099 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2100 goto error0; 2101 if (XFS_IS_CORRUPT(mp, i != 1)) { 2102 xfs_btree_mark_sick(bno_cur); 2103 error = -EFSCORRUPTED; 2104 goto error0; 2105 } 2106 #ifdef DEBUG 2107 /* 2108 * Check that this is the right record: delete didn't 2109 * mangle the cursor. 2110 */ 2111 { 2112 xfs_agblock_t xxbno; 2113 xfs_extlen_t xxlen; 2114 2115 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen, 2116 &i))) 2117 goto error0; 2118 if (XFS_IS_CORRUPT(mp, 2119 i != 1 || 2120 xxbno != ltbno || 2121 xxlen != ltlen)) { 2122 xfs_btree_mark_sick(bno_cur); 2123 error = -EFSCORRUPTED; 2124 goto error0; 2125 } 2126 } 2127 #endif 2128 /* 2129 * Update remaining by-block entry to the new, joined block. 2130 */ 2131 nbno = ltbno; 2132 nlen = len + ltlen + gtlen; 2133 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2134 goto error0; 2135 } 2136 /* 2137 * Have only a left contiguous neighbor. 2138 * Merge it together with the new freespace. 2139 */ 2140 else if (haveleft) { 2141 /* 2142 * Delete the old by-size entry on the left.
2143 */ 2144 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i))) 2145 goto error0; 2146 if (XFS_IS_CORRUPT(mp, i != 1)) { 2147 xfs_btree_mark_sick(cnt_cur); 2148 error = -EFSCORRUPTED; 2149 goto error0; 2150 } 2151 if ((error = xfs_btree_delete(cnt_cur, &i))) 2152 goto error0; 2153 if (XFS_IS_CORRUPT(mp, i != 1)) { 2154 xfs_btree_mark_sick(cnt_cur); 2155 error = -EFSCORRUPTED; 2156 goto error0; 2157 } 2158 /* 2159 * Back up the by-block cursor to the left neighbor, and 2160 * update its length. 2161 */ 2162 if ((error = xfs_btree_decrement(bno_cur, 0, &i))) 2163 goto error0; 2164 if (XFS_IS_CORRUPT(mp, i != 1)) { 2165 xfs_btree_mark_sick(bno_cur); 2166 error = -EFSCORRUPTED; 2167 goto error0; 2168 } 2169 nbno = ltbno; 2170 nlen = len + ltlen; 2171 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2172 goto error0; 2173 } 2174 /* 2175 * Have only a right contiguous neighbor. 2176 * Merge it together with the new freespace. 2177 */ 2178 else if (haveright) { 2179 /* 2180 * Delete the old by-size entry on the right. 2181 */ 2182 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i))) 2183 goto error0; 2184 if (XFS_IS_CORRUPT(mp, i != 1)) { 2185 xfs_btree_mark_sick(cnt_cur); 2186 error = -EFSCORRUPTED; 2187 goto error0; 2188 } 2189 if ((error = xfs_btree_delete(cnt_cur, &i))) 2190 goto error0; 2191 if (XFS_IS_CORRUPT(mp, i != 1)) { 2192 xfs_btree_mark_sick(cnt_cur); 2193 error = -EFSCORRUPTED; 2194 goto error0; 2195 } 2196 /* 2197 * Update the starting block and length of the right 2198 * neighbor in the by-block tree. 2199 */ 2200 nbno = bno; 2201 nlen = len + gtlen; 2202 if ((error = xfs_alloc_update(bno_cur, nbno, nlen))) 2203 goto error0; 2204 } 2205 /* 2206 * No contiguous neighbors. 2207 * Insert the new freespace into the by-block tree. 2208 */ 2209 else { 2210 nbno = bno; 2211 nlen = len; 2212 if ((error = xfs_btree_insert(bno_cur, &i))) 2213 goto error0; 2214 if (XFS_IS_CORRUPT(mp, i != 1)) { 2215 xfs_btree_mark_sick(bno_cur); 2216 error = -EFSCORRUPTED; 2217 goto error0; 2218 } 2219 } 2220 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR); 2221 bno_cur = NULL; 2222 /* 2223 * In all cases we need to insert the new freespace in the by-size tree. 2224 */ 2225 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i))) 2226 goto error0; 2227 if (XFS_IS_CORRUPT(mp, i != 0)) { 2228 xfs_btree_mark_sick(cnt_cur); 2229 error = -EFSCORRUPTED; 2230 goto error0; 2231 } 2232 if ((error = xfs_btree_insert(cnt_cur, &i))) 2233 goto error0; 2234 if (XFS_IS_CORRUPT(mp, i != 1)) { 2235 xfs_btree_mark_sick(cnt_cur); 2236 error = -EFSCORRUPTED; 2237 goto error0; 2238 } 2239 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR); 2240 cnt_cur = NULL; 2241 2242 /* 2243 * Update the freespace totals in the ag and superblock. 2244 */ 2245 error = xfs_alloc_update_counters(tp, agbp, len); 2246 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len); 2247 if (error) 2248 goto error0; 2249 2250 XFS_STATS_INC(mp, xs_freex); 2251 XFS_STATS_ADD(mp, xs_freeb, len); 2252 2253 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright); 2254 2255 return 0; 2256 2257 error0: 2258 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1); 2259 if (bno_cur) 2260 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR); 2261 if (cnt_cur) 2262 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR); 2263 return error; 2264 } 2265 2266 /* 2267 * Visible (exported) allocation/free functions. 2268 * Some of these are used just by xfs_alloc_btree.c and this file. 
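 *
 * To summarise the coalescing performed by xfs_free_ag_extent() above, with
 * ltbno/ltlen and gtbno/gtlen as read from the by-block tree, the new
 * by-block record ends up as one of:
 *
 *	left and right contiguous:	[ltbno, ltlen + len + gtlen]
 *	left only:			[ltbno, ltlen + len]
 *	right only:			[bno, len + gtlen]
 *	neither:			[bno, len]
 *
 * and in every case a fresh by-size record for [nbno, nlen] is then
 * inserted, which is why the cntbt lookup above must find nothing.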
2269 */ 2270 2271 /* 2272 * Compute and fill in value of m_alloc_maxlevels. 2273 */ 2274 void 2275 xfs_alloc_compute_maxlevels( 2276 xfs_mount_t *mp) /* file system mount structure */ 2277 { 2278 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr, 2279 (mp->m_sb.sb_agblocks + 1) / 2); 2280 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk()); 2281 } 2282 2283 /* 2284 * Find the length of the longest extent in an AG. The 'need' parameter 2285 * specifies how much space we're going to need for the AGFL and the 2286 * 'reserved' parameter tells us how many blocks in this AG are reserved for 2287 * other callers. 2288 */ 2289 xfs_extlen_t 2290 xfs_alloc_longest_free_extent( 2291 struct xfs_perag *pag, 2292 xfs_extlen_t need, 2293 xfs_extlen_t reserved) 2294 { 2295 xfs_extlen_t delta = 0; 2296 2297 /* 2298 * If the AGFL needs a recharge, we'll have to subtract that from the 2299 * longest extent. 2300 */ 2301 if (need > pag->pagf_flcount) 2302 delta = need - pag->pagf_flcount; 2303 2304 /* 2305 * If we cannot maintain others' reservations with space from the 2306 * not-longest freesp extents, we'll have to subtract /that/ from 2307 * the longest extent too. 2308 */ 2309 if (pag->pagf_freeblks - pag->pagf_longest < reserved) 2310 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest); 2311 2312 /* 2313 * If the longest extent is long enough to satisfy all the 2314 * reservations and AGFL rules in place, we can return this extent. 2315 */ 2316 if (pag->pagf_longest > delta) 2317 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable, 2318 pag->pagf_longest - delta); 2319 2320 /* Otherwise, let the caller try for 1 block if there's space. */ 2321 return pag->pagf_flcount > 0 || pag->pagf_longest > 0; 2322 } 2323 2324 /* 2325 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL, 2326 * return the largest possible minimum length. 2327 */ 2328 unsigned int 2329 xfs_alloc_min_freelist( 2330 struct xfs_mount *mp, 2331 struct xfs_perag *pag) 2332 { 2333 /* AG btrees have at least 1 level. */ 2334 const unsigned int bno_level = pag ? pag->pagf_bno_level : 1; 2335 const unsigned int cnt_level = pag ? pag->pagf_cnt_level : 1; 2336 const unsigned int rmap_level = pag ? pag->pagf_rmap_level : 1; 2337 unsigned int min_free; 2338 2339 ASSERT(mp->m_alloc_maxlevels > 0); 2340 2341 /* 2342 * For a btree shorter than the maximum height, the worst case is that 2343 * every level gets split and a new level is added, then while inserting 2344 * another entry to refill the AGFL, every level under the old root gets 2345 * split again. This is: 2346 * 2347 * (full height split reservation) + (AGFL refill split height) 2348 * = (current height + 1) + (current height - 1) 2349 * = (new height) + (new height - 2) 2350 * = 2 * new height - 2 2351 * 2352 * For a btree of maximum height, the worst case is that every level 2353 * under the root gets split, then while inserting another entry to 2354 * refill the AGFL, every level under the root gets split again. 
This is 2355 * also: 2356 * 2357 * 2 * (current height - 1) 2358 * = 2 * (new height - 1) 2359 * = 2 * new height - 2 2360 */ 2361 2362 /* space needed by-bno freespace btree */ 2363 min_free = min(bno_level + 1, mp->m_alloc_maxlevels) * 2 - 2; 2364 /* space needed by-size freespace btree */ 2365 min_free += min(cnt_level + 1, mp->m_alloc_maxlevels) * 2 - 2; 2366 /* space needed reverse mapping used space btree */ 2367 if (xfs_has_rmapbt(mp)) 2368 min_free += min(rmap_level + 1, mp->m_rmap_maxlevels) * 2 - 2; 2369 return min_free; 2370 } 2371 2372 /* 2373 * Check if the operation we are fixing up the freelist for should go ahead or 2374 * not. If we are freeing blocks, we always allow it, otherwise the allocation 2375 * is dependent on whether the size and shape of free space available will 2376 * permit the requested allocation to take place. 2377 */ 2378 static bool 2379 xfs_alloc_space_available( 2380 struct xfs_alloc_arg *args, 2381 xfs_extlen_t min_free, 2382 int flags) 2383 { 2384 struct xfs_perag *pag = args->pag; 2385 xfs_extlen_t alloc_len, longest; 2386 xfs_extlen_t reservation; /* blocks that are still reserved */ 2387 int available; 2388 xfs_extlen_t agflcount; 2389 2390 if (flags & XFS_ALLOC_FLAG_FREEING) 2391 return true; 2392 2393 reservation = xfs_ag_resv_needed(pag, args->resv); 2394 2395 /* do we have enough contiguous free space for the allocation? */ 2396 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop; 2397 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation); 2398 if (longest < alloc_len) 2399 return false; 2400 2401 /* 2402 * Do we have enough free space remaining for the allocation? Don't 2403 * account extra agfl blocks because we are about to defer free them, 2404 * making them unavailable until the current transaction commits. 2405 */ 2406 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free); 2407 available = (int)(pag->pagf_freeblks + agflcount - 2408 reservation - min_free - args->minleft); 2409 if (available < (int)max(args->total, alloc_len)) 2410 return false; 2411 2412 /* 2413 * Clamp maxlen to the amount of free space available for the actual 2414 * extent allocation. 2415 */ 2416 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) { 2417 args->maxlen = available; 2418 ASSERT(args->maxlen > 0); 2419 ASSERT(args->maxlen >= args->minlen); 2420 } 2421 2422 return true; 2423 } 2424 2425 int 2426 xfs_free_agfl_block( 2427 struct xfs_trans *tp, 2428 xfs_agnumber_t agno, 2429 xfs_agblock_t agbno, 2430 struct xfs_buf *agbp, 2431 struct xfs_owner_info *oinfo) 2432 { 2433 int error; 2434 struct xfs_buf *bp; 2435 2436 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo, 2437 XFS_AG_RESV_AGFL); 2438 if (error) 2439 return error; 2440 2441 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp, 2442 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno), 2443 tp->t_mountp->m_bsize, 0, &bp); 2444 if (error) 2445 return error; 2446 xfs_trans_binval(tp, bp); 2447 2448 return 0; 2449 } 2450 2451 /* 2452 * Check the agfl fields of the agf for inconsistency or corruption. 2453 * 2454 * The original purpose was to detect an agfl header padding mismatch between 2455 * current and early v5 kernels. This problem manifests as a 1-slot size 2456 * difference between the on-disk flcount and the active [first, last] range of 2457 * a wrapped agfl. 2458 * 2459 * However, we need to use these same checks to catch agfl count corruptions 2460 * unrelated to padding. 
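 *
 * For example, on a 120 slot AGFL that has wrapped so that agf_flfirst is
 * 100 and agf_fllast is 2, the active range computed below is
 * 120 - 100 + 2 + 1 = 23 slots; an ondisk flcount of 22 or 24 is then
 * inconsistent with the active range and forces a reset.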
This could occur on any v4 or v5 filesystem, so either 2461 * way, we need to reset the agfl and warn the user. 2462 * 2463 * Return true if a reset is required before the agfl can be used, false 2464 * otherwise. 2465 */ 2466 static bool 2467 xfs_agfl_needs_reset( 2468 struct xfs_mount *mp, 2469 struct xfs_agf *agf) 2470 { 2471 uint32_t f = be32_to_cpu(agf->agf_flfirst); 2472 uint32_t l = be32_to_cpu(agf->agf_fllast); 2473 uint32_t c = be32_to_cpu(agf->agf_flcount); 2474 int agfl_size = xfs_agfl_size(mp); 2475 int active; 2476 2477 /* 2478 * The agf read verifier catches severe corruption of these fields. 2479 * Repeat some sanity checks to cover a packed -> unpacked mismatch if 2480 * the verifier allows it. 2481 */ 2482 if (f >= agfl_size || l >= agfl_size) 2483 return true; 2484 if (c > agfl_size) 2485 return true; 2486 2487 /* 2488 * Check consistency between the on-disk count and the active range. An 2489 * agfl padding mismatch manifests as an inconsistent flcount. 2490 */ 2491 if (c && l >= f) 2492 active = l - f + 1; 2493 else if (c) 2494 active = agfl_size - f + l + 1; 2495 else 2496 active = 0; 2497 2498 return active != c; 2499 } 2500 2501 /* 2502 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the 2503 * agfl content cannot be trusted. Warn the user that a repair is required to 2504 * recover leaked blocks. 2505 * 2506 * The purpose of this mechanism is to handle filesystems affected by the agfl 2507 * header padding mismatch problem. A reset keeps the filesystem online with a 2508 * relatively minor free space accounting inconsistency rather than suffer the 2509 * inevitable crash from use of an invalid agfl block. 2510 */ 2511 static void 2512 xfs_agfl_reset( 2513 struct xfs_trans *tp, 2514 struct xfs_buf *agbp, 2515 struct xfs_perag *pag) 2516 { 2517 struct xfs_mount *mp = tp->t_mountp; 2518 struct xfs_agf *agf = agbp->b_addr; 2519 2520 ASSERT(xfs_perag_agfl_needs_reset(pag)); 2521 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_); 2522 2523 xfs_warn(mp, 2524 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. " 2525 "Please unmount and run xfs_repair.", 2526 pag->pag_agno, pag->pagf_flcount); 2527 2528 agf->agf_flfirst = 0; 2529 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1); 2530 agf->agf_flcount = 0; 2531 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST | 2532 XFS_AGF_FLCOUNT); 2533 2534 pag->pagf_flcount = 0; 2535 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 2536 } 2537 2538 /* 2539 * Defer an AGFL block free. This is effectively equivalent to 2540 * xfs_free_extent_later() with some special handling particular to AGFL blocks. 2541 * 2542 * Deferring AGFL frees helps prevent log reservation overruns due to too many 2543 * allocation operations in a transaction. AGFL frees are prone to this problem 2544 * because for one they are always freed one at a time. Further, an immediate 2545 * AGFL block free can cause a btree join and require another block free before 2546 * the real allocation can proceed. Deferring the free disconnects freeing up 2547 * the AGFL slot from freeing the block. 
2548 */ 2549 static int 2550 xfs_defer_agfl_block( 2551 struct xfs_trans *tp, 2552 xfs_agnumber_t agno, 2553 xfs_agblock_t agbno, 2554 struct xfs_owner_info *oinfo) 2555 { 2556 struct xfs_mount *mp = tp->t_mountp; 2557 struct xfs_extent_free_item *xefi; 2558 xfs_fsblock_t fsbno = XFS_AGB_TO_FSB(mp, agno, agbno); 2559 2560 ASSERT(xfs_extfree_item_cache != NULL); 2561 ASSERT(oinfo != NULL); 2562 2563 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbno(mp, fsbno))) 2564 return -EFSCORRUPTED; 2565 2566 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2567 GFP_KERNEL | __GFP_NOFAIL); 2568 xefi->xefi_startblock = fsbno; 2569 xefi->xefi_blockcount = 1; 2570 xefi->xefi_owner = oinfo->oi_owner; 2571 xefi->xefi_agresv = XFS_AG_RESV_AGFL; 2572 2573 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1); 2574 2575 xfs_extent_free_get_group(mp, xefi); 2576 xfs_defer_add(tp, &xefi->xefi_list, &xfs_agfl_free_defer_type); 2577 return 0; 2578 } 2579 2580 /* 2581 * Add the extent to the list of extents to be free at transaction end. 2582 * The list is maintained sorted (by block number). 2583 */ 2584 static int 2585 xfs_defer_extent_free( 2586 struct xfs_trans *tp, 2587 xfs_fsblock_t bno, 2588 xfs_filblks_t len, 2589 const struct xfs_owner_info *oinfo, 2590 enum xfs_ag_resv_type type, 2591 bool skip_discard, 2592 struct xfs_defer_pending **dfpp) 2593 { 2594 struct xfs_extent_free_item *xefi; 2595 struct xfs_mount *mp = tp->t_mountp; 2596 #ifdef DEBUG 2597 xfs_agnumber_t agno; 2598 xfs_agblock_t agbno; 2599 2600 ASSERT(bno != NULLFSBLOCK); 2601 ASSERT(len > 0); 2602 ASSERT(len <= XFS_MAX_BMBT_EXTLEN); 2603 ASSERT(!isnullstartblock(bno)); 2604 agno = XFS_FSB_TO_AGNO(mp, bno); 2605 agbno = XFS_FSB_TO_AGBNO(mp, bno); 2606 ASSERT(agno < mp->m_sb.sb_agcount); 2607 ASSERT(agbno < mp->m_sb.sb_agblocks); 2608 ASSERT(len < mp->m_sb.sb_agblocks); 2609 ASSERT(agbno + len <= mp->m_sb.sb_agblocks); 2610 #endif 2611 ASSERT(xfs_extfree_item_cache != NULL); 2612 ASSERT(type != XFS_AG_RESV_AGFL); 2613 2614 if (XFS_IS_CORRUPT(mp, !xfs_verify_fsbext(mp, bno, len))) 2615 return -EFSCORRUPTED; 2616 2617 xefi = kmem_cache_zalloc(xfs_extfree_item_cache, 2618 GFP_KERNEL | __GFP_NOFAIL); 2619 xefi->xefi_startblock = bno; 2620 xefi->xefi_blockcount = (xfs_extlen_t)len; 2621 xefi->xefi_agresv = type; 2622 if (skip_discard) 2623 xefi->xefi_flags |= XFS_EFI_SKIP_DISCARD; 2624 if (oinfo) { 2625 ASSERT(oinfo->oi_offset == 0); 2626 2627 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK) 2628 xefi->xefi_flags |= XFS_EFI_ATTR_FORK; 2629 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK) 2630 xefi->xefi_flags |= XFS_EFI_BMBT_BLOCK; 2631 xefi->xefi_owner = oinfo->oi_owner; 2632 } else { 2633 xefi->xefi_owner = XFS_RMAP_OWN_NULL; 2634 } 2635 trace_xfs_bmap_free_defer(mp, 2636 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0, 2637 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len); 2638 2639 xfs_extent_free_get_group(mp, xefi); 2640 *dfpp = xfs_defer_add(tp, &xefi->xefi_list, &xfs_extent_free_defer_type); 2641 return 0; 2642 } 2643 2644 int 2645 xfs_free_extent_later( 2646 struct xfs_trans *tp, 2647 xfs_fsblock_t bno, 2648 xfs_filblks_t len, 2649 const struct xfs_owner_info *oinfo, 2650 enum xfs_ag_resv_type type, 2651 bool skip_discard) 2652 { 2653 struct xfs_defer_pending *dontcare = NULL; 2654 2655 return xfs_defer_extent_free(tp, bno, len, oinfo, type, skip_discard, 2656 &dontcare); 2657 } 2658 2659 /* 2660 * Set up automatic freeing of unwritten space in the filesystem. 2661 * 2662 * This function attached a paused deferred extent free item to the 2663 * transaction. 
Pausing means that the EFI will be logged in the next 2664 * transaction commit, but the pending EFI will not be finished until the 2665 * pending item is unpaused. 2666 * 2667 * If the system goes down after the EFI has been persisted to the log but 2668 * before the pending item is unpaused, log recovery will find the EFI, fail to 2669 * find the EFD, and free the space. 2670 * 2671 * If the pending item is unpaused, the next transaction commit will log an EFD 2672 * without freeing the space. 2673 * 2674 * Caller must ensure that the tp, fsbno, len, oinfo, and resv flags of the 2675 * @args structure are set to the relevant values. 2676 */ 2677 int 2678 xfs_alloc_schedule_autoreap( 2679 const struct xfs_alloc_arg *args, 2680 bool skip_discard, 2681 struct xfs_alloc_autoreap *aarp) 2682 { 2683 int error; 2684 2685 error = xfs_defer_extent_free(args->tp, args->fsbno, args->len, 2686 &args->oinfo, args->resv, skip_discard, &aarp->dfp); 2687 if (error) 2688 return error; 2689 2690 xfs_defer_item_pause(args->tp, aarp->dfp); 2691 return 0; 2692 } 2693 2694 /* 2695 * Cancel automatic freeing of unwritten space in the filesystem. 2696 * 2697 * Earlier, we created a paused deferred extent free item and attached it to 2698 * this transaction so that we could automatically roll back a new space 2699 * allocation if the system went down. Now we want to cancel the paused work 2700 * item by marking the EFI stale so we don't actually free the space, unpausing 2701 * the pending item and logging an EFD. 2702 * 2703 * The caller generally should have already mapped the space into the ondisk 2704 * filesystem. If the reserved space was partially used, the caller must call 2705 * xfs_free_extent_later to create a new EFI to free the unused space. 2706 */ 2707 void 2708 xfs_alloc_cancel_autoreap( 2709 struct xfs_trans *tp, 2710 struct xfs_alloc_autoreap *aarp) 2711 { 2712 struct xfs_defer_pending *dfp = aarp->dfp; 2713 struct xfs_extent_free_item *xefi; 2714 2715 if (!dfp) 2716 return; 2717 2718 list_for_each_entry(xefi, &dfp->dfp_work, xefi_list) 2719 xefi->xefi_flags |= XFS_EFI_CANCELLED; 2720 2721 xfs_defer_item_unpause(tp, dfp); 2722 } 2723 2724 /* 2725 * Commit automatic freeing of unwritten space in the filesystem. 2726 * 2727 * This unpauses an earlier _schedule_autoreap and commits to freeing the 2728 * allocated space. Call this if none of the reserved space was used. 2729 */ 2730 void 2731 xfs_alloc_commit_autoreap( 2732 struct xfs_trans *tp, 2733 struct xfs_alloc_autoreap *aarp) 2734 { 2735 if (aarp->dfp) 2736 xfs_defer_item_unpause(tp, aarp->dfp); 2737 } 2738 2739 #ifdef DEBUG 2740 /* 2741 * Check if an AGF has a free extent record whose length is equal to 2742 * args->minlen. 
2743 */ 2744 STATIC int 2745 xfs_exact_minlen_extent_available( 2746 struct xfs_alloc_arg *args, 2747 struct xfs_buf *agbp, 2748 int *stat) 2749 { 2750 struct xfs_btree_cur *cnt_cur; 2751 xfs_agblock_t fbno; 2752 xfs_extlen_t flen; 2753 int error = 0; 2754 2755 cnt_cur = xfs_cntbt_init_cursor(args->mp, args->tp, agbp, 2756 args->pag); 2757 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat); 2758 if (error) 2759 goto out; 2760 2761 if (*stat == 0) { 2762 xfs_btree_mark_sick(cnt_cur); 2763 error = -EFSCORRUPTED; 2764 goto out; 2765 } 2766 2767 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat); 2768 if (error) 2769 goto out; 2770 2771 if (*stat == 1 && flen != args->minlen) 2772 *stat = 0; 2773 2774 out: 2775 xfs_btree_del_cursor(cnt_cur, error); 2776 2777 return error; 2778 } 2779 #endif 2780 2781 /* 2782 * Decide whether to use this allocation group for this allocation. 2783 * If so, fix up the btree freelist's size. 2784 */ 2785 int /* error */ 2786 xfs_alloc_fix_freelist( 2787 struct xfs_alloc_arg *args, /* allocation argument structure */ 2788 uint32_t alloc_flags) 2789 { 2790 struct xfs_mount *mp = args->mp; 2791 struct xfs_perag *pag = args->pag; 2792 struct xfs_trans *tp = args->tp; 2793 struct xfs_buf *agbp = NULL; 2794 struct xfs_buf *agflbp = NULL; 2795 struct xfs_alloc_arg targs; /* local allocation arguments */ 2796 xfs_agblock_t bno; /* freelist block */ 2797 xfs_extlen_t need; /* total blocks needed in freelist */ 2798 int error = 0; 2799 2800 /* deferred ops (AGFL block frees) require permanent transactions */ 2801 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES); 2802 2803 if (!xfs_perag_initialised_agf(pag)) { 2804 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2805 if (error) { 2806 /* Couldn't lock the AGF so skip this AG. */ 2807 if (error == -EAGAIN) 2808 error = 0; 2809 goto out_no_agbp; 2810 } 2811 } 2812 2813 /* 2814 * If this is a metadata preferred pag and we are user data then try 2815 * somewhere else if we are not being asked to try harder at this 2816 * point 2817 */ 2818 if (xfs_perag_prefers_metadata(pag) && 2819 (args->datatype & XFS_ALLOC_USERDATA) && 2820 (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK)) { 2821 ASSERT(!(alloc_flags & XFS_ALLOC_FLAG_FREEING)); 2822 goto out_agbp_relse; 2823 } 2824 2825 need = xfs_alloc_min_freelist(mp, pag); 2826 if (!xfs_alloc_space_available(args, need, alloc_flags | 2827 XFS_ALLOC_FLAG_CHECK)) 2828 goto out_agbp_relse; 2829 2830 /* 2831 * Get the a.g. freespace buffer. 2832 * Can fail if we're not blocking on locks, and it's held. 2833 */ 2834 if (!agbp) { 2835 error = xfs_alloc_read_agf(pag, tp, alloc_flags, &agbp); 2836 if (error) { 2837 /* Couldn't lock the AGF so skip this AG. */ 2838 if (error == -EAGAIN) 2839 error = 0; 2840 goto out_no_agbp; 2841 } 2842 } 2843 2844 /* reset a padding mismatched agfl before final free space check */ 2845 if (xfs_perag_agfl_needs_reset(pag)) 2846 xfs_agfl_reset(tp, agbp, pag); 2847 2848 /* If there isn't enough total space or single-extent, reject it. */ 2849 need = xfs_alloc_min_freelist(mp, pag); 2850 if (!xfs_alloc_space_available(args, need, alloc_flags)) 2851 goto out_agbp_relse; 2852 2853 #ifdef DEBUG 2854 if (args->alloc_minlen_only) { 2855 int stat; 2856 2857 error = xfs_exact_minlen_extent_available(args, agbp, &stat); 2858 if (error || !stat) 2859 goto out_agbp_relse; 2860 } 2861 #endif 2862 /* 2863 * Make the freelist shorter if it's too long. 2864 * 2865 * Note that from this point onwards, we will always release the agf and 2866 * agfl buffers on error. 
This handles the case where we error out and 2867 * the buffers are clean or may not have been joined to the transaction 2868 * and hence need to be released manually. If they have been joined to 2869 * the transaction, then xfs_trans_brelse() will handle them 2870 * appropriately based on the recursion count and dirty state of the 2871 * buffer. 2872 * 2873 * XXX (dgc): When we have lots of free space, does this buy us 2874 * anything other than extra overhead when we need to put more blocks 2875 * back on the free list? Maybe we should only do this when space is 2876 * getting low or the AGFL is more than half full? 2877 * 2878 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too 2879 * big; the NORMAP flag prevents AGFL expand/shrink operations from 2880 * updating the rmapbt. Both flags are used in xfs_repair while we're 2881 * rebuilding the rmapbt, and neither are used by the kernel. They're 2882 * both required to ensure that rmaps are correctly recorded for the 2883 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and 2884 * repair/rmap.c in xfsprogs for details. 2885 */ 2886 memset(&targs, 0, sizeof(targs)); 2887 /* struct copy below */ 2888 if (alloc_flags & XFS_ALLOC_FLAG_NORMAP) 2889 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE; 2890 else 2891 targs.oinfo = XFS_RMAP_OINFO_AG; 2892 while (!(alloc_flags & XFS_ALLOC_FLAG_NOSHRINK) && 2893 pag->pagf_flcount > need) { 2894 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0); 2895 if (error) 2896 goto out_agbp_relse; 2897 2898 /* defer agfl frees */ 2899 error = xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo); 2900 if (error) 2901 goto out_agbp_relse; 2902 } 2903 2904 targs.tp = tp; 2905 targs.mp = mp; 2906 targs.agbp = agbp; 2907 targs.agno = args->agno; 2908 targs.alignment = targs.minlen = targs.prod = 1; 2909 targs.pag = pag; 2910 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 2911 if (error) 2912 goto out_agbp_relse; 2913 2914 /* Make the freelist longer if it's too short. */ 2915 while (pag->pagf_flcount < need) { 2916 targs.agbno = 0; 2917 targs.maxlen = need - pag->pagf_flcount; 2918 targs.resv = XFS_AG_RESV_AGFL; 2919 2920 /* Allocate as many blocks as possible at once. */ 2921 error = xfs_alloc_ag_vextent_size(&targs, alloc_flags); 2922 if (error) 2923 goto out_agflbp_relse; 2924 2925 /* 2926 * Stop if we run out. Won't happen if callers are obeying 2927 * the restrictions correctly. Can happen for free calls 2928 * on a completely full ag. 2929 */ 2930 if (targs.agbno == NULLAGBLOCK) { 2931 if (alloc_flags & XFS_ALLOC_FLAG_FREEING) 2932 break; 2933 goto out_agflbp_relse; 2934 } 2935 2936 if (!xfs_rmap_should_skip_owner_update(&targs.oinfo)) { 2937 error = xfs_rmap_alloc(tp, agbp, pag, 2938 targs.agbno, targs.len, &targs.oinfo); 2939 if (error) 2940 goto out_agflbp_relse; 2941 } 2942 error = xfs_alloc_update_counters(tp, agbp, 2943 -((long)(targs.len))); 2944 if (error) 2945 goto out_agflbp_relse; 2946 2947 /* 2948 * Put each allocated block on the list. 2949 */ 2950 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) { 2951 error = xfs_alloc_put_freelist(pag, tp, agbp, 2952 agflbp, bno, 0); 2953 if (error) 2954 goto out_agflbp_relse; 2955 } 2956 } 2957 xfs_trans_brelse(tp, agflbp); 2958 args->agbp = agbp; 2959 return 0; 2960 2961 out_agflbp_relse: 2962 xfs_trans_brelse(tp, agflbp); 2963 out_agbp_relse: 2964 if (agbp) 2965 xfs_trans_brelse(tp, agbp); 2966 out_no_agbp: 2967 args->agbp = NULL; 2968 return error; 2969 } 2970 2971 /* 2972 * Get a block from the freelist. 
2973 * Returns with the buffer for the block gotten. 2974 */ 2975 int 2976 xfs_alloc_get_freelist( 2977 struct xfs_perag *pag, 2978 struct xfs_trans *tp, 2979 struct xfs_buf *agbp, 2980 xfs_agblock_t *bnop, 2981 int btreeblk) 2982 { 2983 struct xfs_agf *agf = agbp->b_addr; 2984 struct xfs_buf *agflbp; 2985 xfs_agblock_t bno; 2986 __be32 *agfl_bno; 2987 int error; 2988 uint32_t logflags; 2989 struct xfs_mount *mp = tp->t_mountp; 2990 2991 /* 2992 * Freelist is empty, give up. 2993 */ 2994 if (!agf->agf_flcount) { 2995 *bnop = NULLAGBLOCK; 2996 return 0; 2997 } 2998 /* 2999 * Read the array of free blocks. 3000 */ 3001 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 3002 if (error) 3003 return error; 3004 3005 3006 /* 3007 * Get the block number and update the data structures. 3008 */ 3009 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3010 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]); 3011 if (XFS_IS_CORRUPT(tp->t_mountp, !xfs_verify_agbno(pag, bno))) 3012 return -EFSCORRUPTED; 3013 3014 be32_add_cpu(&agf->agf_flfirst, 1); 3015 xfs_trans_brelse(tp, agflbp); 3016 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp)) 3017 agf->agf_flfirst = 0; 3018 3019 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 3020 be32_add_cpu(&agf->agf_flcount, -1); 3021 pag->pagf_flcount--; 3022 3023 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT; 3024 if (btreeblk) { 3025 be32_add_cpu(&agf->agf_btreeblks, 1); 3026 pag->pagf_btreeblks++; 3027 logflags |= XFS_AGF_BTREEBLKS; 3028 } 3029 3030 xfs_alloc_log_agf(tp, agbp, logflags); 3031 *bnop = bno; 3032 3033 return 0; 3034 } 3035 3036 /* 3037 * Log the given fields from the agf structure. 3038 */ 3039 void 3040 xfs_alloc_log_agf( 3041 struct xfs_trans *tp, 3042 struct xfs_buf *bp, 3043 uint32_t fields) 3044 { 3045 int first; /* first byte offset */ 3046 int last; /* last byte offset */ 3047 static const short offsets[] = { 3048 offsetof(xfs_agf_t, agf_magicnum), 3049 offsetof(xfs_agf_t, agf_versionnum), 3050 offsetof(xfs_agf_t, agf_seqno), 3051 offsetof(xfs_agf_t, agf_length), 3052 offsetof(xfs_agf_t, agf_bno_root), /* also cnt/rmap root */ 3053 offsetof(xfs_agf_t, agf_bno_level), /* also cnt/rmap levels */ 3054 offsetof(xfs_agf_t, agf_flfirst), 3055 offsetof(xfs_agf_t, agf_fllast), 3056 offsetof(xfs_agf_t, agf_flcount), 3057 offsetof(xfs_agf_t, agf_freeblks), 3058 offsetof(xfs_agf_t, agf_longest), 3059 offsetof(xfs_agf_t, agf_btreeblks), 3060 offsetof(xfs_agf_t, agf_uuid), 3061 offsetof(xfs_agf_t, agf_rmap_blocks), 3062 offsetof(xfs_agf_t, agf_refcount_blocks), 3063 offsetof(xfs_agf_t, agf_refcount_root), 3064 offsetof(xfs_agf_t, agf_refcount_level), 3065 /* needed so that we don't log the whole rest of the structure: */ 3066 offsetof(xfs_agf_t, agf_spare64), 3067 sizeof(xfs_agf_t) 3068 }; 3069 3070 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_); 3071 3072 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF); 3073 3074 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last); 3075 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last); 3076 } 3077 3078 /* 3079 * Put the block on the freelist for the allocation group. 
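 *
 * The AGFL behaves as a ring of xfs_agfl_size(mp) slots:
 * xfs_alloc_get_freelist() above consumes the slot at agf_flfirst and
 * xfs_alloc_put_freelist() below appends at agf_fllast, both wrapping back
 * to slot 0 once they step past the end of the array. For example, with
 * 512 byte sectors and no CRC header the AGFL holds 128 entries, so a get
 * at agf_flfirst == 127 leaves agf_flfirst at 0.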
3080 */ 3081 int 3082 xfs_alloc_put_freelist( 3083 struct xfs_perag *pag, 3084 struct xfs_trans *tp, 3085 struct xfs_buf *agbp, 3086 struct xfs_buf *agflbp, 3087 xfs_agblock_t bno, 3088 int btreeblk) 3089 { 3090 struct xfs_mount *mp = tp->t_mountp; 3091 struct xfs_agf *agf = agbp->b_addr; 3092 __be32 *blockp; 3093 int error; 3094 uint32_t logflags; 3095 __be32 *agfl_bno; 3096 int startoff; 3097 3098 if (!agflbp) { 3099 error = xfs_alloc_read_agfl(pag, tp, &agflbp); 3100 if (error) 3101 return error; 3102 } 3103 3104 be32_add_cpu(&agf->agf_fllast, 1); 3105 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp)) 3106 agf->agf_fllast = 0; 3107 3108 ASSERT(!xfs_perag_agfl_needs_reset(pag)); 3109 be32_add_cpu(&agf->agf_flcount, 1); 3110 pag->pagf_flcount++; 3111 3112 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT; 3113 if (btreeblk) { 3114 be32_add_cpu(&agf->agf_btreeblks, -1); 3115 pag->pagf_btreeblks--; 3116 logflags |= XFS_AGF_BTREEBLKS; 3117 } 3118 3119 xfs_alloc_log_agf(tp, agbp, logflags); 3120 3121 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)); 3122 3123 agfl_bno = xfs_buf_to_agfl_bno(agflbp); 3124 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)]; 3125 *blockp = cpu_to_be32(bno); 3126 startoff = (char *)blockp - (char *)agflbp->b_addr; 3127 3128 xfs_alloc_log_agf(tp, agbp, logflags); 3129 3130 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF); 3131 xfs_trans_log_buf(tp, agflbp, startoff, 3132 startoff + sizeof(xfs_agblock_t) - 1); 3133 return 0; 3134 } 3135 3136 /* 3137 * Check that this AGF/AGI header's sequence number and length matches the AG 3138 * number and size in fsblocks. 3139 */ 3140 xfs_failaddr_t 3141 xfs_validate_ag_length( 3142 struct xfs_buf *bp, 3143 uint32_t seqno, 3144 uint32_t length) 3145 { 3146 struct xfs_mount *mp = bp->b_mount; 3147 /* 3148 * During growfs operations, the perag is not fully initialised, 3149 * so we can't use it for any useful checking. growfs ensures we can't 3150 * use it by using uncached buffers that don't have the perag attached 3151 * so we can detect and avoid this problem. 3152 */ 3153 if (bp->b_pag && seqno != bp->b_pag->pag_agno) 3154 return __this_address; 3155 3156 /* 3157 * Only the last AG in the filesystem is allowed to be shorter 3158 * than the AG size recorded in the superblock. 3159 */ 3160 if (length != mp->m_sb.sb_agblocks) { 3161 /* 3162 * During growfs, the new last AG can get here before we 3163 * have updated the superblock. Give it a pass on the seqno 3164 * check. 3165 */ 3166 if (bp->b_pag && seqno != mp->m_sb.sb_agcount - 1) 3167 return __this_address; 3168 if (length < XFS_MIN_AG_BLOCKS) 3169 return __this_address; 3170 if (length > mp->m_sb.sb_agblocks) 3171 return __this_address; 3172 } 3173 3174 return NULL; 3175 } 3176 3177 /* 3178 * Verify the AGF is consistent. 3179 * 3180 * We do not verify the AGFL indexes in the AGF are fully consistent here 3181 * because of issues with variable on-disk structure sizes. Instead, we check 3182 * the agfl indexes for consistency when we initialise the perag from the AGF 3183 * information after a read completes. 3184 * 3185 * If the index is inconsistent, then we mark the perag as needing an AGFL 3186 * reset. The first AGFL update performed then resets the AGFL indexes and 3187 * refills the AGFL with known good free blocks, allowing the filesystem to 3188 * continue operating normally at the cost of a few leaked free space blocks. 
3189 */ 3190 static xfs_failaddr_t 3191 xfs_agf_verify( 3192 struct xfs_buf *bp) 3193 { 3194 struct xfs_mount *mp = bp->b_mount; 3195 struct xfs_agf *agf = bp->b_addr; 3196 xfs_failaddr_t fa; 3197 uint32_t agf_seqno = be32_to_cpu(agf->agf_seqno); 3198 uint32_t agf_length = be32_to_cpu(agf->agf_length); 3199 3200 if (xfs_has_crc(mp)) { 3201 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid)) 3202 return __this_address; 3203 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn))) 3204 return __this_address; 3205 } 3206 3207 if (!xfs_verify_magic(bp, agf->agf_magicnum)) 3208 return __this_address; 3209 3210 if (!XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum))) 3211 return __this_address; 3212 3213 /* 3214 * Both agf_seqno and agf_length need to validated before anything else 3215 * block number related in the AGF or AGFL can be checked. 3216 */ 3217 fa = xfs_validate_ag_length(bp, agf_seqno, agf_length); 3218 if (fa) 3219 return fa; 3220 3221 if (be32_to_cpu(agf->agf_flfirst) >= xfs_agfl_size(mp)) 3222 return __this_address; 3223 if (be32_to_cpu(agf->agf_fllast) >= xfs_agfl_size(mp)) 3224 return __this_address; 3225 if (be32_to_cpu(agf->agf_flcount) > xfs_agfl_size(mp)) 3226 return __this_address; 3227 3228 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) || 3229 be32_to_cpu(agf->agf_freeblks) > agf_length) 3230 return __this_address; 3231 3232 if (be32_to_cpu(agf->agf_bno_level) < 1 || 3233 be32_to_cpu(agf->agf_cnt_level) < 1 || 3234 be32_to_cpu(agf->agf_bno_level) > mp->m_alloc_maxlevels || 3235 be32_to_cpu(agf->agf_cnt_level) > mp->m_alloc_maxlevels) 3236 return __this_address; 3237 3238 if (xfs_has_lazysbcount(mp) && 3239 be32_to_cpu(agf->agf_btreeblks) > agf_length) 3240 return __this_address; 3241 3242 if (xfs_has_rmapbt(mp)) { 3243 if (be32_to_cpu(agf->agf_rmap_blocks) > agf_length) 3244 return __this_address; 3245 3246 if (be32_to_cpu(agf->agf_rmap_level) < 1 || 3247 be32_to_cpu(agf->agf_rmap_level) > mp->m_rmap_maxlevels) 3248 return __this_address; 3249 } 3250 3251 if (xfs_has_reflink(mp)) { 3252 if (be32_to_cpu(agf->agf_refcount_blocks) > agf_length) 3253 return __this_address; 3254 3255 if (be32_to_cpu(agf->agf_refcount_level) < 1 || 3256 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels) 3257 return __this_address; 3258 } 3259 3260 return NULL; 3261 } 3262 3263 static void 3264 xfs_agf_read_verify( 3265 struct xfs_buf *bp) 3266 { 3267 struct xfs_mount *mp = bp->b_mount; 3268 xfs_failaddr_t fa; 3269 3270 if (xfs_has_crc(mp) && 3271 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF)) 3272 xfs_verifier_error(bp, -EFSBADCRC, __this_address); 3273 else { 3274 fa = xfs_agf_verify(bp); 3275 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF)) 3276 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3277 } 3278 } 3279 3280 static void 3281 xfs_agf_write_verify( 3282 struct xfs_buf *bp) 3283 { 3284 struct xfs_mount *mp = bp->b_mount; 3285 struct xfs_buf_log_item *bip = bp->b_log_item; 3286 struct xfs_agf *agf = bp->b_addr; 3287 xfs_failaddr_t fa; 3288 3289 fa = xfs_agf_verify(bp); 3290 if (fa) { 3291 xfs_verifier_error(bp, -EFSCORRUPTED, fa); 3292 return; 3293 } 3294 3295 if (!xfs_has_crc(mp)) 3296 return; 3297 3298 if (bip) 3299 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn); 3300 3301 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF); 3302 } 3303 3304 const struct xfs_buf_ops xfs_agf_buf_ops = { 3305 .name = "xfs_agf", 3306 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) }, 3307 .verify_read = xfs_agf_read_verify, 3308 .verify_write = 
xfs_agf_write_verify, 3309 .verify_struct = xfs_agf_verify, 3310 }; 3311 3312 /* 3313 * Read in the allocation group header (free/alloc section). 3314 */ 3315 int 3316 xfs_read_agf( 3317 struct xfs_perag *pag, 3318 struct xfs_trans *tp, 3319 int flags, 3320 struct xfs_buf **agfbpp) 3321 { 3322 struct xfs_mount *mp = pag->pag_mount; 3323 int error; 3324 3325 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno); 3326 3327 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, 3328 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)), 3329 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops); 3330 if (xfs_metadata_is_sick(error)) 3331 xfs_ag_mark_sick(pag, XFS_SICK_AG_AGF); 3332 if (error) 3333 return error; 3334 3335 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF); 3336 return 0; 3337 } 3338 3339 /* 3340 * Read in the allocation group header (free/alloc section) and initialise the 3341 * perag structure if necessary. If the caller provides @agfbpp, then return the 3342 * locked buffer to the caller, otherwise free it. 3343 */ 3344 int 3345 xfs_alloc_read_agf( 3346 struct xfs_perag *pag, 3347 struct xfs_trans *tp, 3348 int flags, 3349 struct xfs_buf **agfbpp) 3350 { 3351 struct xfs_buf *agfbp; 3352 struct xfs_agf *agf; 3353 int error; 3354 int allocbt_blks; 3355 3356 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno); 3357 3358 /* We don't support trylock when freeing. */ 3359 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) != 3360 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)); 3361 error = xfs_read_agf(pag, tp, 3362 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0, 3363 &agfbp); 3364 if (error) 3365 return error; 3366 3367 agf = agfbp->b_addr; 3368 if (!xfs_perag_initialised_agf(pag)) { 3369 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks); 3370 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks); 3371 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount); 3372 pag->pagf_longest = be32_to_cpu(agf->agf_longest); 3373 pag->pagf_bno_level = be32_to_cpu(agf->agf_bno_level); 3374 pag->pagf_cnt_level = be32_to_cpu(agf->agf_cnt_level); 3375 pag->pagf_rmap_level = be32_to_cpu(agf->agf_rmap_level); 3376 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level); 3377 if (xfs_agfl_needs_reset(pag->pag_mount, agf)) 3378 set_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3379 else 3380 clear_bit(XFS_AGSTATE_AGFL_NEEDS_RESET, &pag->pag_opstate); 3381 3382 /* 3383 * Update the in-core allocbt counter. Filter out the rmapbt 3384 * subset of the btreeblks counter because the rmapbt is managed 3385 * by perag reservation. Subtract one for the rmapbt root block 3386 * because the rmap counter includes it while the btreeblks 3387 * counter only tracks non-root blocks. 
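 *
 * For example, with agf_btreeblks = 10 and agf_rmap_blocks = 4 on an
 * rmapbt filesystem, the adjustment below attributes 10 - (4 - 1) = 7
 * blocks to the bnobt/cntbt.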
3388 */ 3389 allocbt_blks = pag->pagf_btreeblks; 3390 if (xfs_has_rmapbt(pag->pag_mount)) 3391 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1; 3392 if (allocbt_blks > 0) 3393 atomic64_add(allocbt_blks, 3394 &pag->pag_mount->m_allocbt_blks); 3395 3396 set_bit(XFS_AGSTATE_AGF_INIT, &pag->pag_opstate); 3397 } 3398 #ifdef DEBUG 3399 else if (!xfs_is_shutdown(pag->pag_mount)) { 3400 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks)); 3401 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks)); 3402 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount)); 3403 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest)); 3404 ASSERT(pag->pagf_bno_level == be32_to_cpu(agf->agf_bno_level)); 3405 ASSERT(pag->pagf_cnt_level == be32_to_cpu(agf->agf_cnt_level)); 3406 } 3407 #endif 3408 if (agfbpp) 3409 *agfbpp = agfbp; 3410 else 3411 xfs_trans_brelse(tp, agfbp); 3412 return 0; 3413 } 3414 3415 /* 3416 * Pre-process allocation arguments to set initial state that we don't require 3417 * callers to set up correctly, as well as bounds check the allocation args 3418 * that are set up. 3419 */ 3420 static int 3421 xfs_alloc_vextent_check_args( 3422 struct xfs_alloc_arg *args, 3423 xfs_fsblock_t target, 3424 xfs_agnumber_t *minimum_agno) 3425 { 3426 struct xfs_mount *mp = args->mp; 3427 xfs_agblock_t agsize; 3428 3429 args->fsbno = NULLFSBLOCK; 3430 3431 *minimum_agno = 0; 3432 if (args->tp->t_highest_agno != NULLAGNUMBER) 3433 *minimum_agno = args->tp->t_highest_agno; 3434 3435 /* 3436 * Just fix this up, for the case where the last a.g. is shorter 3437 * (or there's only one a.g.) and the caller couldn't easily figure 3438 * that out (xfs_bmap_alloc). 3439 */ 3440 agsize = mp->m_sb.sb_agblocks; 3441 if (args->maxlen > agsize) 3442 args->maxlen = agsize; 3443 if (args->alignment == 0) 3444 args->alignment = 1; 3445 3446 ASSERT(args->minlen > 0); 3447 ASSERT(args->maxlen > 0); 3448 ASSERT(args->alignment > 0); 3449 ASSERT(args->resv != XFS_AG_RESV_AGFL); 3450 3451 ASSERT(XFS_FSB_TO_AGNO(mp, target) < mp->m_sb.sb_agcount); 3452 ASSERT(XFS_FSB_TO_AGBNO(mp, target) < agsize); 3453 ASSERT(args->minlen <= args->maxlen); 3454 ASSERT(args->minlen <= agsize); 3455 ASSERT(args->mod < args->prod); 3456 3457 if (XFS_FSB_TO_AGNO(mp, target) >= mp->m_sb.sb_agcount || 3458 XFS_FSB_TO_AGBNO(mp, target) >= agsize || 3459 args->minlen > args->maxlen || args->minlen > agsize || 3460 args->mod >= args->prod) { 3461 trace_xfs_alloc_vextent_badargs(args); 3462 return -ENOSPC; 3463 } 3464 3465 if (args->agno != NULLAGNUMBER && *minimum_agno > args->agno) { 3466 trace_xfs_alloc_vextent_skip_deadlock(args); 3467 return -ENOSPC; 3468 } 3469 return 0; 3470 3471 } 3472 3473 /* 3474 * Prepare an AG for allocation. If the AG is not prepared to accept the 3475 * allocation, return failure. 3476 * 3477 * XXX(dgc): The complexity of "need_pag" will go away as all caller paths are 3478 * modified to hold their own perag references.
3479 */ 3480 static int 3481 xfs_alloc_vextent_prepare_ag( 3482 struct xfs_alloc_arg *args, 3483 uint32_t alloc_flags) 3484 { 3485 bool need_pag = !args->pag; 3486 int error; 3487 3488 if (need_pag) 3489 args->pag = xfs_perag_get(args->mp, args->agno); 3490 3491 args->agbp = NULL; 3492 error = xfs_alloc_fix_freelist(args, alloc_flags); 3493 if (error) { 3494 trace_xfs_alloc_vextent_nofix(args); 3495 if (need_pag) 3496 xfs_perag_put(args->pag); 3497 args->agbno = NULLAGBLOCK; 3498 return error; 3499 } 3500 if (!args->agbp) { 3501 /* cannot allocate in this AG at all */ 3502 trace_xfs_alloc_vextent_noagbp(args); 3503 args->agbno = NULLAGBLOCK; 3504 return 0; 3505 } 3506 args->wasfromfl = 0; 3507 return 0; 3508 } 3509 3510 /* 3511 * Post-process allocation results to account for the allocation if it succeeded 3512 * and set the allocated block number correctly for the caller. 3513 * 3514 * XXX: we should really be returning ENOSPC for ENOSPC, not 3515 * hiding it behind a "successful" NULLFSBLOCK allocation. 3516 */ 3517 static int 3518 xfs_alloc_vextent_finish( 3519 struct xfs_alloc_arg *args, 3520 xfs_agnumber_t minimum_agno, 3521 int alloc_error, 3522 bool drop_perag) 3523 { 3524 struct xfs_mount *mp = args->mp; 3525 int error = 0; 3526 3527 /* 3528 * We can end up here with a locked AGF. If we failed, the caller is 3529 * likely going to try to allocate again with different parameters, and 3530 * that can widen the AGs that are searched for free space. If we have 3531 * to do BMBT block allocation, we have to do a new allocation. 3532 * 3533 * Hence leaving this function with the AGF locked opens up potential 3534 * ABBA AGF deadlocks because a future allocation attempt in this 3535 * transaction may attempt to lock a lower number AGF. 3536 * 3537 * We can't release the AGF until the transaction is committed, so at 3538 * this point we must update the "first allocation" tracker to point at 3539 * this AG if the tracker is empty or points to a lower AG. This allows 3540 * the next allocation attempt to be modified appropriately to avoid 3541 * deadlocks. 3542 */ 3543 if (args->agbp && 3544 (args->tp->t_highest_agno == NULLAGNUMBER || 3545 args->agno > minimum_agno)) 3546 args->tp->t_highest_agno = args->agno; 3547 3548 /* 3549 * If the allocation failed with an error or we had an ENOSPC result, 3550 * preserve the returned error whilst also marking the allocation result 3551 * as "no extent allocated". This ensures that callers that fail to 3552 * capture the error will still treat it as a failed allocation.
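 *
 * Callers therefore typically treat a zero return with args->fsbno set to
 * NULLFSBLOCK as "no space in this AG" and either move on to another AG or
 * retry with relaxed allocation arguments.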
3553 */ 3554 if (alloc_error || args->agbno == NULLAGBLOCK) { 3555 args->fsbno = NULLFSBLOCK; 3556 error = alloc_error; 3557 goto out_drop_perag; 3558 } 3559 3560 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno); 3561 3562 ASSERT(args->len >= args->minlen); 3563 ASSERT(args->len <= args->maxlen); 3564 ASSERT(args->agbno % args->alignment == 0); 3565 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno), args->len); 3566 3567 /* if not file data, insert new block into the reverse map btree */ 3568 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) { 3569 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag, 3570 args->agbno, args->len, &args->oinfo); 3571 if (error) 3572 goto out_drop_perag; 3573 } 3574 3575 if (!args->wasfromfl) { 3576 error = xfs_alloc_update_counters(args->tp, args->agbp, 3577 -((long)(args->len))); 3578 if (error) 3579 goto out_drop_perag; 3580 3581 ASSERT(!xfs_extent_busy_search(mp, args->pag, args->agbno, 3582 args->len)); 3583 } 3584 3585 xfs_ag_resv_alloc_extent(args->pag, args->resv, args); 3586 3587 XFS_STATS_INC(mp, xs_allocx); 3588 XFS_STATS_ADD(mp, xs_allocb, args->len); 3589 3590 trace_xfs_alloc_vextent_finish(args); 3591 3592 out_drop_perag: 3593 if (drop_perag && args->pag) { 3594 xfs_perag_rele(args->pag); 3595 args->pag = NULL; 3596 } 3597 return error; 3598 } 3599 3600 /* 3601 * Allocate within a single AG only. This uses a best-fit length algorithm so if 3602 * you need an exact sized allocation without locality constraints, this is the 3603 * fastest way to do it. 3604 * 3605 * Caller is expected to hold a perag reference in args->pag. 3606 */ 3607 int 3608 xfs_alloc_vextent_this_ag( 3609 struct xfs_alloc_arg *args, 3610 xfs_agnumber_t agno) 3611 { 3612 struct xfs_mount *mp = args->mp; 3613 xfs_agnumber_t minimum_agno; 3614 uint32_t alloc_flags = 0; 3615 int error; 3616 3617 ASSERT(args->pag != NULL); 3618 ASSERT(args->pag->pag_agno == agno); 3619 3620 args->agno = agno; 3621 args->agbno = 0; 3622 3623 trace_xfs_alloc_vextent_this_ag(args); 3624 3625 error = xfs_alloc_vextent_check_args(args, XFS_AGB_TO_FSB(mp, agno, 0), 3626 &minimum_agno); 3627 if (error) { 3628 if (error == -ENOSPC) 3629 return 0; 3630 return error; 3631 } 3632 3633 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3634 if (!error && args->agbp) 3635 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3636 3637 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3638 } 3639 3640 /* 3641 * Iterate all AGs trying to allocate an extent starting from @start_ag. 3642 * 3643 * If the incoming allocation type is XFS_ALLOCTYPE_NEAR_BNO, it means the 3644 * allocation attempts in @start_agno have locality information. If we fail to 3645 * allocate in that AG, then we revert to anywhere-in-AG for all the other AGs 3646 * we attempt to allocate in as there is no locality optimisation possible for 3647 * those allocations. 3648 * 3649 * On return, args->pag may be left referenced if we finish before the "all 3650 * failed" return point. The allocation finish still needs the perag, and 3651 * so the caller will release it once they've finished the allocation. 3652 * 3653 * When we wrap the AG iteration at the end of the filesystem, we have to be 3654 * careful not to wrap into AGs below ones we already have locked in the 3655 * transaction if we are doing a blocking iteration. This will result in an 3656 * out-of-order locking of AGFs and hence can cause deadlocks.
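 *
 * For example, with sb_agcount = 4, start_agno = 2 and minimum_agno = 1,
 * the initial trylock pass may visit AGs 2, 3, 0, 1, but once
 * XFS_ALLOC_FLAG_TRYLOCK is dropped the blocking retry only wraps down as
 * far as AG 1 (i.e. 2, 3, 1), so we never block on an AGF numbered below
 * one this transaction may already hold locked.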
3657 */ 3658 static int 3659 xfs_alloc_vextent_iterate_ags( 3660 struct xfs_alloc_arg *args, 3661 xfs_agnumber_t minimum_agno, 3662 xfs_agnumber_t start_agno, 3663 xfs_agblock_t target_agbno, 3664 uint32_t alloc_flags) 3665 { 3666 struct xfs_mount *mp = args->mp; 3667 xfs_agnumber_t restart_agno = minimum_agno; 3668 xfs_agnumber_t agno; 3669 int error = 0; 3670 3671 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) 3672 restart_agno = 0; 3673 restart: 3674 for_each_perag_wrap_range(mp, start_agno, restart_agno, 3675 mp->m_sb.sb_agcount, agno, args->pag) { 3676 args->agno = agno; 3677 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3678 if (error) 3679 break; 3680 if (!args->agbp) { 3681 trace_xfs_alloc_vextent_loopfailed(args); 3682 continue; 3683 } 3684 3685 /* 3686 * Allocation is supposed to succeed now, so break out of the 3687 * loop regardless of whether we succeed or not. 3688 */ 3689 if (args->agno == start_agno && target_agbno) { 3690 args->agbno = target_agbno; 3691 error = xfs_alloc_ag_vextent_near(args, alloc_flags); 3692 } else { 3693 args->agbno = 0; 3694 error = xfs_alloc_ag_vextent_size(args, alloc_flags); 3695 } 3696 break; 3697 } 3698 if (error) { 3699 xfs_perag_rele(args->pag); 3700 args->pag = NULL; 3701 return error; 3702 } 3703 if (args->agbp) 3704 return 0; 3705 3706 /* 3707 * We didn't find an AG we can allocate from. If we were given 3708 * constraining flags by the caller, drop them and retry the allocation 3709 * without any constraints being set. 3710 */ 3711 if (alloc_flags & XFS_ALLOC_FLAG_TRYLOCK) { 3712 alloc_flags &= ~XFS_ALLOC_FLAG_TRYLOCK; 3713 restart_agno = minimum_agno; 3714 goto restart; 3715 } 3716 3717 ASSERT(args->pag == NULL); 3718 trace_xfs_alloc_vextent_allfailed(args); 3719 return 0; 3720 } 3721 3722 /* 3723 * Iterate the AGs from the start AG to the end of the filesystem, trying 3724 * to allocate blocks. It starts with a near allocation attempt in the initial 3725 * AG, then falls back to anywhere-in-ag after the first AG fails. It will wrap 3726 * back to zero if allowed by previous allocations in this transaction, 3727 * otherwise will wrap back to the start AG and run a second blocking pass to 3728 * the end of the filesystem.
3729 */ 3730 int 3731 xfs_alloc_vextent_start_ag( 3732 struct xfs_alloc_arg *args, 3733 xfs_fsblock_t target) 3734 { 3735 struct xfs_mount *mp = args->mp; 3736 xfs_agnumber_t minimum_agno; 3737 xfs_agnumber_t start_agno; 3738 xfs_agnumber_t rotorstep = xfs_rotorstep; 3739 bool bump_rotor = false; 3740 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3741 int error; 3742 3743 ASSERT(args->pag == NULL); 3744 3745 args->agno = NULLAGNUMBER; 3746 args->agbno = NULLAGBLOCK; 3747 3748 trace_xfs_alloc_vextent_start_ag(args); 3749 3750 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3751 if (error) { 3752 if (error == -ENOSPC) 3753 return 0; 3754 return error; 3755 } 3756 3757 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) && 3758 xfs_is_inode32(mp)) { 3759 target = XFS_AGB_TO_FSB(mp, 3760 ((mp->m_agfrotor / rotorstep) % 3761 mp->m_sb.sb_agcount), 0); 3762 bump_rotor = 1; 3763 } 3764 3765 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3766 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3767 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3768 3769 if (bump_rotor) { 3770 if (args->agno == start_agno) 3771 mp->m_agfrotor = (mp->m_agfrotor + 1) % 3772 (mp->m_sb.sb_agcount * rotorstep); 3773 else 3774 mp->m_agfrotor = (args->agno * rotorstep + 1) % 3775 (mp->m_sb.sb_agcount * rotorstep); 3776 } 3777 3778 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3779 } 3780 3781 /* 3782 * Iterate from the agno indicated via @target through to the end of the 3783 * filesystem attempting blocking allocation. This does not wrap or try a second 3784 * pass, so will not recurse into AGs lower than indicated by the target. 3785 */ 3786 int 3787 xfs_alloc_vextent_first_ag( 3788 struct xfs_alloc_arg *args, 3789 xfs_fsblock_t target) 3790 { 3791 struct xfs_mount *mp = args->mp; 3792 xfs_agnumber_t minimum_agno; 3793 xfs_agnumber_t start_agno; 3794 uint32_t alloc_flags = XFS_ALLOC_FLAG_TRYLOCK; 3795 int error; 3796 3797 ASSERT(args->pag == NULL); 3798 3799 args->agno = NULLAGNUMBER; 3800 args->agbno = NULLAGBLOCK; 3801 3802 trace_xfs_alloc_vextent_first_ag(args); 3803 3804 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3805 if (error) { 3806 if (error == -ENOSPC) 3807 return 0; 3808 return error; 3809 } 3810 3811 start_agno = max(minimum_agno, XFS_FSB_TO_AGNO(mp, target)); 3812 error = xfs_alloc_vextent_iterate_ags(args, minimum_agno, start_agno, 3813 XFS_FSB_TO_AGBNO(mp, target), alloc_flags); 3814 return xfs_alloc_vextent_finish(args, minimum_agno, error, true); 3815 } 3816 3817 /* 3818 * Allocate at the exact block target or fail. Caller is expected to hold a 3819 * perag reference in args->pag. 
3820 */ 3821 int 3822 xfs_alloc_vextent_exact_bno( 3823 struct xfs_alloc_arg *args, 3824 xfs_fsblock_t target) 3825 { 3826 struct xfs_mount *mp = args->mp; 3827 xfs_agnumber_t minimum_agno; 3828 int error; 3829 3830 ASSERT(args->pag != NULL); 3831 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); 3832 3833 args->agno = XFS_FSB_TO_AGNO(mp, target); 3834 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3835 3836 trace_xfs_alloc_vextent_exact_bno(args); 3837 3838 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3839 if (error) { 3840 if (error == -ENOSPC) 3841 return 0; 3842 return error; 3843 } 3844 3845 error = xfs_alloc_vextent_prepare_ag(args, 0); 3846 if (!error && args->agbp) 3847 error = xfs_alloc_ag_vextent_exact(args); 3848 3849 return xfs_alloc_vextent_finish(args, minimum_agno, error, false); 3850 } 3851 3852 /* 3853 * Allocate an extent as close to the target as possible. If there are no 3854 * viable candidates in the AG, then fail the allocation. 3855 * 3856 * Caller may or may not have a per-ag reference in args->pag. 3857 */ 3858 int 3859 xfs_alloc_vextent_near_bno( 3860 struct xfs_alloc_arg *args, 3861 xfs_fsblock_t target) 3862 { 3863 struct xfs_mount *mp = args->mp; 3864 xfs_agnumber_t minimum_agno; 3865 bool needs_perag = args->pag == NULL; 3866 uint32_t alloc_flags = 0; 3867 int error; 3868 3869 if (!needs_perag) 3870 ASSERT(args->pag->pag_agno == XFS_FSB_TO_AGNO(mp, target)); 3871 3872 args->agno = XFS_FSB_TO_AGNO(mp, target); 3873 args->agbno = XFS_FSB_TO_AGBNO(mp, target); 3874 3875 trace_xfs_alloc_vextent_near_bno(args); 3876 3877 error = xfs_alloc_vextent_check_args(args, target, &minimum_agno); 3878 if (error) { 3879 if (error == -ENOSPC) 3880 return 0; 3881 return error; 3882 } 3883 3884 if (needs_perag) 3885 args->pag = xfs_perag_grab(mp, args->agno); 3886 3887 error = xfs_alloc_vextent_prepare_ag(args, alloc_flags); 3888 if (!error && args->agbp) 3889 error = xfs_alloc_ag_vextent_near(args, alloc_flags); 3890 3891 return xfs_alloc_vextent_finish(args, minimum_agno, error, needs_perag); 3892 } 3893 3894 /* Ensure that the freelist is at full capacity. */ 3895 int 3896 xfs_free_extent_fix_freelist( 3897 struct xfs_trans *tp, 3898 struct xfs_perag *pag, 3899 struct xfs_buf **agbp) 3900 { 3901 struct xfs_alloc_arg args; 3902 int error; 3903 3904 memset(&args, 0, sizeof(struct xfs_alloc_arg)); 3905 args.tp = tp; 3906 args.mp = tp->t_mountp; 3907 args.agno = pag->pag_agno; 3908 args.pag = pag; 3909 3910 /* 3911 * Validate that the block number is legal - this enables us to detect 3912 * and handle a silent filesystem corruption rather than crashing. 3913 */ 3914 if (args.agno >= args.mp->m_sb.sb_agcount) 3915 return -EFSCORRUPTED; 3916 3917 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING); 3918 if (error) 3919 return error; 3920 3921 *agbp = args.agbp; 3922 return 0; 3923 } 3924 3925 /* 3926 * Free an extent. 3927 * Just break up the extent address and hand off to xfs_free_ag_extent 3928 * after fixing up the freelist.
/*
 * Free an extent.
 * Just break up the extent address and hand off to xfs_free_ag_extent
 * after fixing up the freelist.
 */
int
__xfs_free_extent(
	struct xfs_trans		*tp,
	struct xfs_perag		*pag,
	xfs_agblock_t			agbno,
	xfs_extlen_t			len,
	const struct xfs_owner_info	*oinfo,
	enum xfs_ag_resv_type		type,
	bool				skip_discard)
{
	struct xfs_mount		*mp = tp->t_mountp;
	struct xfs_buf			*agbp;
	struct xfs_agf			*agf;
	int				error;
	unsigned int			busy_flags = 0;

	ASSERT(len != 0);
	ASSERT(type != XFS_AG_RESV_AGFL);

	if (XFS_TEST_ERROR(false, mp,
			XFS_ERRTAG_FREE_EXTENT))
		return -EIO;

	error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
	if (error) {
		if (xfs_metadata_is_sick(error))
			xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
		return error;
	}

	agf = agbp->b_addr;

	if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
		error = -EFSCORRUPTED;
		goto err_release;
	}

	/* validate the extent size is legal now that we have the agf locked */
	if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
		xfs_ag_mark_sick(pag, XFS_SICK_AG_BNOBT);
		error = -EFSCORRUPTED;
		goto err_release;
	}

	error = xfs_free_ag_extent(tp, agbp, pag->pag_agno, agbno, len, oinfo,
			type);
	if (error)
		goto err_release;

	if (skip_discard)
		busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
	xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
	return 0;

err_release:
	xfs_trans_brelse(tp, agbp);
	return error;
}

struct xfs_alloc_query_range_info {
	xfs_alloc_query_range_fn	fn;
	void				*priv;
};

/* Format btree record and pass to our callback. */
STATIC int
xfs_alloc_query_range_helper(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_rec	*rec,
	void				*priv)
{
	struct xfs_alloc_query_range_info	*query = priv;
	struct xfs_alloc_rec_incore		irec;
	xfs_failaddr_t				fa;

	xfs_alloc_btrec_to_irec(rec, &irec);
	fa = xfs_alloc_check_irec(cur->bc_ag.pag, &irec);
	if (fa)
		return xfs_alloc_complain_bad_rec(cur, fa, &irec);

	return query->fn(cur, &irec, query->priv);
}

/* Find all free space within a given range of blocks. */
int
xfs_alloc_query_range(
	struct xfs_btree_cur			*cur,
	const struct xfs_alloc_rec_incore	*low_rec,
	const struct xfs_alloc_rec_incore	*high_rec,
	xfs_alloc_query_range_fn		fn,
	void					*priv)
{
	union xfs_btree_irec			low_brec = { .a = *low_rec };
	union xfs_btree_irec			high_brec = { .a = *high_rec };
	struct xfs_alloc_query_range_info	query = { .priv = priv, .fn = fn };

	ASSERT(xfs_btree_is_bno(cur->bc_ops));
	return xfs_btree_query_range(cur, &low_brec, &high_brec,
			xfs_alloc_query_range_helper, &query);
}

/* Find all free space records. */
int
xfs_alloc_query_all(
	struct xfs_btree_cur		*cur,
	xfs_alloc_query_range_fn	fn,
	void				*priv)
{
	struct xfs_alloc_query_range_info	query;

	ASSERT(xfs_btree_is_bno(cur->bc_ops));
	query.priv = priv;
	query.fn = fn;
	return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
}
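/*
 * Illustrative sketch only (hypothetical helper, not part of this file):
 * a query callback receives each free space record in incore form and can
 * stop the walk by returning a negative errno. Counting the free extents
 * visible through a by-block (bnobt) cursor @cur could look like this:
 *
 *	static int
 *	xfs_count_free_helper(
 *		struct xfs_btree_cur			*cur,
 *		const struct xfs_alloc_rec_incore	*rec,
 *		void					*priv)
 *	{
 *		uint64_t	*nr = priv;
 *
 *		(*nr)++;
 *		return 0;
 *	}
 *
 *	uint64_t	nr = 0;
 *
 *	error = xfs_alloc_query_all(cur, xfs_count_free_helper, &nr);
 */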
/*
 * Scan part of the keyspace of the free space and tell us if the area has no
 * records, is fully mapped by records, or is partially filled.
 */
int
xfs_alloc_has_records(
	struct xfs_btree_cur	*cur,
	xfs_agblock_t		bno,
	xfs_extlen_t		len,
	enum xbtree_recpacking	*outcome)
{
	union xfs_btree_irec	low;
	union xfs_btree_irec	high;

	memset(&low, 0, sizeof(low));
	low.a.ar_startblock = bno;
	memset(&high, 0xFF, sizeof(high));
	high.a.ar_startblock = bno + len - 1;

	return xfs_btree_has_records(cur, &low, &high, NULL, outcome);
}

/*
 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
 * error code or XFS_ITER_*.
 */
int
xfs_agfl_walk(
	struct xfs_mount	*mp,
	struct xfs_agf		*agf,
	struct xfs_buf		*agflbp,
	xfs_agfl_walk_fn	walk_fn,
	void			*priv)
{
	__be32			*agfl_bno;
	unsigned int		i;
	int			error;

	agfl_bno = xfs_buf_to_agfl_bno(agflbp);
	i = be32_to_cpu(agf->agf_flfirst);

	/* Nothing to walk in an empty AGFL. */
	if (agf->agf_flcount == cpu_to_be32(0))
		return 0;

	/* Otherwise, walk from first to last, wrapping as needed. */
	for (;;) {
		error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
		if (error)
			return error;
		if (i == be32_to_cpu(agf->agf_fllast))
			break;
		if (++i == xfs_agfl_size(mp))
			i = 0;
	}

	return 0;
}

int __init
xfs_extfree_intent_init_cache(void)
{
	xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
			sizeof(struct xfs_extent_free_item),
			0, 0, NULL);

	return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
}

void
xfs_extfree_intent_destroy_cache(void)
{
	kmem_cache_destroy(xfs_extfree_item_cache);
	xfs_extfree_item_cache = NULL;
}
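/*
 * Illustrative sketch only (hypothetical callback, not part of this file):
 * an AGFL walk callback for xfs_agfl_walk() above is invoked once per
 * occupied AGFL slot with the block number stored there, and stops the walk
 * early by returning a negative errno or a non-zero XFS_ITER_* value.
 * Counting the AGFL blocks could look like this, assuming the caller already
 * holds the AGF and AGFL buffers agf/agflbp:
 *
 *	static int
 *	xfs_agfl_count_block(
 *		struct xfs_mount	*mp,
 *		xfs_agblock_t		agbno,
 *		void			*priv)
 *	{
 *		unsigned int	*nr = priv;
 *
 *		(*nr)++;
 *		return 0;
 *	}
 *
 *	unsigned int	nr = 0;
 *
 *	error = xfs_agfl_walk(mp, agf, agflbp, xfs_agfl_count_block, &nr);
 */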