// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Oracle. All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_iwalk.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_health.h"
#include "xfs_trans.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"

/*
 * Walking Inodes in the Filesystem
 * ================================
 *
 * This iterator function walks a subset of filesystem inodes in increasing
 * order from @startino until there are no more inodes. For each allocated
 * inode it finds, it calls a walk function with the relevant inode number and
 * a pointer to caller-provided data. The walk function can return the usual
 * negative error code to stop the iteration; 0 to continue the iteration; or
 * -ECANCELED to stop the iteration. This return value is returned to the
 * caller.
 *
 * Internally, we allow the walk function to do anything, which means that we
 * cannot maintain the inobt cursor or our lock on the AGI buffer. We
 * therefore cache the inobt records in kernel memory and only call the walk
 * function when our memory buffer is full. @nr_recs is the number of records
 * that we've cached, and @sz_recs is the size of our cache.
 *
 * It is the responsibility of the walk function to ensure it accesses
 * allocated inodes, as the inobt records may be stale by the time they are
 * acted upon.
 */

struct xfs_iwalk_ag {
	/* parallel work control data; will be null if single threaded */
	struct xfs_pwork		pwork;

	struct xfs_mount		*mp;
	struct xfs_trans		*tp;
	struct xfs_perag		*pag;

	/* Where do we start the traversal? */
	xfs_ino_t			startino;

	/* What was the last inode number we saw when iterating the inobt? */
	xfs_ino_t			lastino;

	/* Array of inobt records we cache. */
	struct xfs_inobt_rec_incore	*recs;

	/* Number of entries allocated for the @recs array. */
	unsigned int			sz_recs;

	/* Number of entries in the @recs array that are in use. */
	unsigned int			nr_recs;

	/* Inode walk function and data pointer. */
	xfs_iwalk_fn			iwalk_fn;
	xfs_inobt_walk_fn		inobt_walk_fn;
	void				*data;

	/*
	 * Make it look like the inodes up to startino are free so that
	 * bulkstat can start its inode iteration at the correct place without
	 * needing to special case everywhere.
	 */
	unsigned int			trim_start:1;

	/* Skip empty inobt records? */
	unsigned int			skip_empty:1;

	/* Drop the (hopefully empty) transaction when calling iwalk_fn. */
	unsigned int			drop_trans:1;
};
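
/*
 * Illustrative sketch of the walk function contract described above
 * (hypothetical, not used anywhere in this file): count allocated inodes
 * until a caller-chosen limit is reached. The function and context names
 * are invented for the example; only the xfs_iwalk_fn signature and the
 * return convention come from this file.
 */
struct xfs_iwalk_example_ctr {
	unsigned long long	counted;
	unsigned long long	limit;
};

static inline int
xfs_iwalk_example_fn(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	struct xfs_iwalk_example_ctr	*ctr = data;

	/* -ECANCELED stops the walk and is handed back to the caller. */
	if (++ctr->counted >= ctr->limit)
		return -ECANCELED;

	/* Zero keeps the iteration going. */
	return 0;
}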
/*
 * Loop over all clusters in a chunk for a given incore inode allocation btree
 * record. Do a readahead if there are any allocated inodes in that cluster.
 */
STATIC void
xfs_iwalk_ichunk_ra(
	struct xfs_mount		*mp,
	struct xfs_perag		*pag,
	struct xfs_inobt_rec_incore	*irec)
{
	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
	xfs_agblock_t			agbno;
	struct blk_plug			plug;
	int				i;	/* inode chunk index */

	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);

	blk_start_plug(&plug);
	for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
		xfs_inofree_t	imask;

		imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
		if (imask & ~irec->ir_free) {
			xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
					igeo->blocks_per_cluster,
					&xfs_inode_buf_ops);
		}
		agbno += igeo->blocks_per_cluster;
	}
	blk_finish_plug(&plug);
}

/*
 * Set the bits in @irec's free mask that correspond to the inodes before
 * @agino so that we skip them. This is how we restart an inode walk that was
 * interrupted in the middle of an inode record.
 */
STATIC void
xfs_iwalk_adjust_start(
	xfs_agino_t			agino,	/* starting inode of chunk */
	struct xfs_inobt_rec_incore	*irec)	/* btree record */
{
	int				idx;	/* index into inode chunk */
	int				i;

	idx = agino - irec->ir_startino;

	/*
	 * We found the right chunk, but the walk is restarting partway into
	 * it. Mark all the inodes before our start point free (and account
	 * for them in ir_freecount) so that the walk skips them.
	 */
	for (i = 0; i < idx; i++) {
		if (XFS_INOBT_MASK(i) & ~irec->ir_free)
			irec->ir_freecount++;
	}

	irec->ir_free |= xfs_inobt_maskn(0, idx);
}

/* Allocate memory for a walk. */
STATIC int
xfs_iwalk_alloc(
	struct xfs_iwalk_ag	*iwag)
{
	size_t			size;

	ASSERT(iwag->recs == NULL);
	iwag->nr_recs = 0;

	/* Allocate a prefetch buffer for inobt records. */
	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
	if (iwag->recs == NULL)
		return -ENOMEM;

	return 0;
}

/* Free memory we allocated for a walk. */
STATIC void
xfs_iwalk_free(
	struct xfs_iwalk_ag	*iwag)
{
	kmem_free(iwag->recs);
	iwag->recs = NULL;
}

/* For each in-use inode in each cached inobt record, call our function. */
STATIC int
xfs_iwalk_ag_recs(
	struct xfs_iwalk_ag	*iwag)
{
	struct xfs_mount	*mp = iwag->mp;
	struct xfs_trans	*tp = iwag->tp;
	struct xfs_perag	*pag = iwag->pag;
	xfs_ino_t		ino;
	unsigned int		i, j;
	int			error;

	for (i = 0; i < iwag->nr_recs; i++) {
		struct xfs_inobt_rec_incore	*irec = &iwag->recs[i];

		trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);

		if (xfs_pwork_want_abort(&iwag->pwork))
			return 0;

		if (iwag->inobt_walk_fn) {
			error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
					iwag->data);
			if (error)
				return error;
		}

		if (!iwag->iwalk_fn)
			continue;

		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
			if (xfs_pwork_want_abort(&iwag->pwork))
				return 0;

			/* Skip if this inode is free */
			if (XFS_INOBT_MASK(j) & irec->ir_free)
				continue;

			/* Otherwise call our function. */
			ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
					irec->ir_startino + j);
			error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
			if (error)
				return error;
		}
	}

	return 0;
}
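
/*
 * Worked example (an illustrative sketch, not called anywhere) of how
 * xfs_iwalk_adjust_start() above cooperates with the free-mask test in
 * xfs_iwalk_ag_recs(): restarting a walk at agino 133 inside a fully
 * allocated chunk at agino 128 marks inodes 128-132 "free", so the
 * per-inode loop skips exactly those five inodes. The numbers are
 * invented for the example.
 */
static inline void
xfs_iwalk_adjust_start_example(void)
{
	struct xfs_inobt_rec_incore	irec = {
		.ir_startino	= 128,
		.ir_count	= XFS_INODES_PER_CHUNK,
		.ir_freecount	= 0,
		.ir_free	= 0,
	};

	xfs_iwalk_adjust_start(133, &irec);

	/* The low five bits of the free mask are now set... */
	ASSERT(irec.ir_free == xfs_inobt_maskn(0, 5));

	/* ...and the record accounts for five more "free" inodes. */
	ASSERT(irec.ir_freecount == 5);
}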
/* Delete cursor and let go of AGI. */
static inline void
xfs_iwalk_del_inobt(
	struct xfs_trans	*tp,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			error)
{
	if (*curpp) {
		xfs_btree_del_cursor(*curpp, error);
		*curpp = NULL;
	}
	if (*agi_bpp) {
		xfs_trans_brelse(tp, *agi_bpp);
		*agi_bpp = NULL;
	}
}

/*
 * Set ourselves up for walking inobt records starting from a given point in
 * the filesystem.
 *
 * If caller passed in a nonzero start inode number, load the record from the
 * inobt and make the record look like all the inodes before agino are free so
 * that we skip them, and then move the cursor to the next inobt record. This
 * is how we support starting an iwalk in the middle of an inode chunk.
 *
 * If the caller passed in a start number of zero, move the cursor to the first
 * inobt record.
 *
 * The caller is responsible for cleaning up the cursor and buffer pointer
 * regardless of the error status.
 */
STATIC int
xfs_iwalk_ag_start(
	struct xfs_iwalk_ag	*iwag,
	xfs_agino_t		agino,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			*has_more)
{
	struct xfs_mount	*mp = iwag->mp;
	struct xfs_trans	*tp = iwag->tp;
	struct xfs_perag	*pag = iwag->pag;
	struct xfs_inobt_rec_incore *irec;
	int			error;

	/* Set up a fresh cursor and empty the inobt cache. */
	iwag->nr_recs = 0;
	error = xfs_inobt_cur(pag, tp, XFS_BTNUM_INO, curpp, agi_bpp);
	if (error)
		return error;

	/* Starting at the beginning of the AG? That's easy! */
	if (agino == 0)
		return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);

	/*
	 * Otherwise, we have to grab the inobt record where we left off, stuff
	 * the record into our cache, and then see if there are more records.
	 * We require a lookup cache of at least two elements so that the
	 * caller doesn't have to deal with tearing down the cursor to walk the
	 * records.
	 */
	error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
	if (error)
		return error;

	/*
	 * If the LE lookup at @agino yields no records, jump ahead to the
	 * inobt cursor increment to see if there are more records to process.
	 */
	if (!*has_more)
		goto out_advance;

	/* Get the record, should always work */
	irec = &iwag->recs[iwag->nr_recs];
	error = xfs_inobt_get_rec(*curpp, irec, has_more);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(mp, *has_more != 1))
		return -EFSCORRUPTED;

	iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
				irec->ir_startino + XFS_INODES_PER_CHUNK - 1);

	/*
	 * If the LE lookup yielded an inobt record before the cursor position,
	 * skip it and see if there's another one after it.
	 */
	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
		goto out_advance;

	/*
	 * If agino fell in the middle of the inode record, make it look like
	 * the inodes up to agino are free so that we don't return them again.
	 */
	if (iwag->trim_start)
		xfs_iwalk_adjust_start(agino, irec);

	/*
	 * The prefetch calculation is supposed to give us a large enough inobt
	 * record cache that grab_ichunk can stage a partial first record and
	 * the loop body can cache a record without having to check for cache
	 * space until after it reads an inobt record.
	 */
	iwag->nr_recs++;
	ASSERT(iwag->nr_recs < iwag->sz_recs);

out_advance:
	return xfs_btree_increment(*curpp, 0, has_more);
}

/*
 * The inobt record cache is full, so preserve the inobt cursor state and
 * run callbacks on the cached inobt records. When we're done, restore the
 * cursor state to wherever the cursor would have been had the cache not been
 * full (and therefore we could've just incremented the cursor) if *@has_more
 * is true. On exit, *@has_more will indicate whether or not the caller should
 * try for more inode records.
 */
STATIC int
xfs_iwalk_run_callbacks(
	struct xfs_iwalk_ag		*iwag,
	struct xfs_btree_cur		**curpp,
	struct xfs_buf			**agi_bpp,
	int				*has_more)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_inobt_rec_incore	*irec;
	xfs_agino_t			next_agino;
	int				error;

	next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;

	ASSERT(iwag->nr_recs > 0);

	/* Delete cursor but remember the last record we cached... */
	xfs_iwalk_del_inobt(iwag->tp, curpp, agi_bpp, 0);
	irec = &iwag->recs[iwag->nr_recs - 1];
	ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);

	if (iwag->drop_trans) {
		xfs_trans_cancel(iwag->tp);
		iwag->tp = NULL;
	}

	error = xfs_iwalk_ag_recs(iwag);
	if (error)
		return error;

	/* ...empty the cache... */
	iwag->nr_recs = 0;

	if (!has_more)
		return 0;

	if (iwag->drop_trans) {
		error = xfs_trans_alloc_empty(mp, &iwag->tp);
		if (error)
			return error;
	}

	/* ...and recreate the cursor just past where we left off. */
	error = xfs_inobt_cur(iwag->pag, iwag->tp, XFS_BTNUM_INO, curpp,
			agi_bpp);
	if (error)
		return error;

	return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}

/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
STATIC int
xfs_iwalk_ag(
	struct xfs_iwalk_ag		*iwag)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_perag		*pag = iwag->pag;
	struct xfs_buf			*agi_bp = NULL;
	struct xfs_btree_cur		*cur = NULL;
	xfs_agino_t			agino;
	int				has_more;
	int				error = 0;

	/* Set up our cursor at the right place in the inode btree. */
	ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
	error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);

	while (!error && has_more) {
		struct xfs_inobt_rec_incore	*irec;
		xfs_ino_t			rec_fsino;

		cond_resched();
		if (xfs_pwork_want_abort(&iwag->pwork))
			goto out;

		/* Fetch the inobt record. */
		irec = &iwag->recs[iwag->nr_recs];
		error = xfs_inobt_get_rec(cur, irec, &has_more);
		if (error || !has_more)
			break;

		/* Make sure that we always move forward. */
		rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
		if (iwag->lastino != NULLFSINO &&
		    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
			error = -EFSCORRUPTED;
			goto out;
		}
		iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;

		/* No allocated inodes in this chunk; skip it. */
		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error)
				break;
			continue;
		}

		/*
		 * Start readahead for this inode chunk in anticipation of
		 * walking the inodes.
		 */
		if (iwag->iwalk_fn)
			xfs_iwalk_ichunk_ra(mp, pag, irec);

		/*
		 * If there's space in the buffer for more records, increment
		 * the btree cursor and grab more.
		 */
		if (++iwag->nr_recs < iwag->sz_recs) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error || !has_more)
				break;
			continue;
		}

		/*
		 * Otherwise, we need to save cursor state and run the callback
		 * function on the cached records. The run_callbacks function
		 * is supposed to return a cursor pointing to the record where
		 * we would be if we had been able to increment like above.
		 */
		ASSERT(has_more);
		error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
	}

	if (iwag->nr_recs == 0 || error)
		goto out;

	/* Walk the unprocessed records in the cache. */
	error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);

out:
	xfs_iwalk_del_inobt(iwag->tp, &cur, &agi_bp, error);
	return error;
}

/*
 * We experimentally determined that the reduction in ioctl call overhead
 * diminishes when userspace asks for more than 2048 inodes, so we'll cap
 * prefetch at this point.
 */
#define IWALK_MAX_INODE_PREFETCH	(2048U)

/*
 * Given the number of inodes to prefetch, set the number of inobt records that
 * we cache in memory, which controls the number of inodes we try to read
 * ahead. Set the maximum if @inodes == 0.
 */
static inline unsigned int
xfs_iwalk_prefetch(
	unsigned int		inodes)
{
	unsigned int		inobt_records;

	/*
	 * If the caller didn't tell us the number of inodes they wanted,
	 * assume the maximum prefetch possible for best performance.
	 * Otherwise, cap prefetch at that maximum so that we don't start an
	 * absurd amount of prefetch.
	 */
	if (inodes == 0)
		inodes = IWALK_MAX_INODE_PREFETCH;
	inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);

	/* Round the inode count up to a full chunk. */
	inodes = round_up(inodes, XFS_INODES_PER_CHUNK);

	/*
	 * In order to convert the number of inodes to prefetch into an
	 * estimate of the number of inobt records to cache, we require a
	 * conversion factor that reflects our expectations of the average
	 * loading factor of an inode chunk. Based on data gathered, most
	 * (but not all) filesystems manage to keep the inode chunks totally
	 * full, so we'll underestimate slightly so that our readahead will
	 * still deliver the performance we want on aging filesystems:
	 *
	 * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
	 *
	 * The funny math is to avoid integer division.
	 */
	inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record. This simplifies the AG inode walk loop setup code.
	 */
	return max(inobt_records, 2U);
}
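
/*
 * Worked example of the conversion above (an illustrative sketch, not
 * called anywhere; it assumes XFS_INODES_PER_CHUNK == 64): a request for
 * 1024 inodes is already chunk-aligned, and (1024 * 5) / (4 * 64) yields
 * 20 cached inobt records, i.e. one record per ~51 requested inodes
 * rather than per 64, to allow for partially full chunks.
 */
static inline void
xfs_iwalk_prefetch_example(void)
{
	ASSERT(xfs_iwalk_prefetch(1024) == 20);

	/* Tiny requests are still granted the two-record minimum. */
	ASSERT(xfs_iwalk_prefetch(1) == 2);
}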
/*
 * Walk all inodes in the filesystem starting from @startino. The @iwalk_fn
 * will be called for each allocated inode, being passed the inode's number and
 * @data. @inode_records controls how many inobt records' worth of inodes we
 * try to readahead.
 */
int
xfs_iwalk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.iwalk_fn	= iwalk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_iwalk_prefetch(inode_records),
		.trim_start	= 1,
		.skip_empty	= 1,
		.pwork		= XFS_PWORK_SINGLE_THREADED,
		.lastino	= NULLFSINO,
	};
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for_each_perag_from(mp, agno, pag) {
		iwag.pag = pag;
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
		iwag.pag = NULL;
	}

	if (iwag.pag)
		xfs_perag_rele(pag);
	xfs_iwalk_free(&iwag);
	return error;
}

/* Run per-thread iwalk work. */
static int
xfs_iwalk_ag_work(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct xfs_iwalk_ag	*iwag;
	int			error = 0;

	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
	if (xfs_pwork_want_abort(pwork))
		goto out;

	error = xfs_iwalk_alloc(iwag);
	if (error)
		goto out;
	/*
	 * Grab an empty transaction so that we can use its recursive buffer
	 * locking abilities to detect cycles in the inobt without deadlocking.
	 */
	error = xfs_trans_alloc_empty(mp, &iwag->tp);
	if (error)
		goto out;
	iwag->drop_trans = 1;

	error = xfs_iwalk_ag(iwag);
	if (iwag->tp)
		xfs_trans_cancel(iwag->tp);
	xfs_iwalk_free(iwag);
out:
	xfs_perag_put(iwag->pag);
	kmem_free(iwag);
	return error;
}

/*
 * Walk all the inodes in the filesystem using multiple threads to process each
 * AG.
 */
int
xfs_iwalk_threaded(
	struct xfs_mount	*mp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	bool			polled,
	void			*data)
{
	struct xfs_pwork_ctl	pctl;
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
	if (error)
		return error;

	for_each_perag_from(mp, agno, pag) {
		struct xfs_iwalk_ag	*iwag;

		if (xfs_pwork_ctl_want_abort(&pctl))
			break;

		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
		iwag->mp = mp;

		/*
		 * perag is being handed off to async work, so take a passive
		 * reference for the async work to release.
		 */
		iwag->pag = xfs_perag_hold(pag);
		iwag->iwalk_fn = iwalk_fn;
		iwag->data = data;
		iwag->startino = startino;
		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
		iwag->lastino = NULLFSINO;
		xfs_pwork_queue(&pctl, &iwag->pwork);
		startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
	}
	if (pag)
		xfs_perag_rele(pag);
	if (polled)
		xfs_pwork_poll(&pctl);
	return xfs_pwork_destroy(&pctl);
}
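
/*
 * Hedged usage sketch (illustrative only, not called anywhere in this
 * file): drive a single-threaded walk of the whole filesystem with the
 * example callback sketched above. xfs_iwalk_example_fn and struct
 * xfs_iwalk_example_ctr are the hypothetical definitions introduced
 * earlier; everything else is this file's real API. A zero
 * @inode_records count lets xfs_iwalk_prefetch() pick maximum readahead.
 */
static inline int
xfs_iwalk_example(
	struct xfs_mount		*mp)
{
	struct xfs_iwalk_example_ctr	ctr = { .limit = 1000 };

	/*
	 * Start at inode 0 with no flags and no transaction; callers may
	 * also supply an empty transaction, as xfs_iwalk_ag_work() does.
	 */
	return xfs_iwalk(mp, NULL, 0, 0, xfs_iwalk_example_fn, 0, &ctr);
}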
/*
 * Allow callers to cache up to a page's worth of inobt records. This reflects
 * the existing inumbers prefetching behavior. Since the inobt walk does not
 * itself do anything with the inobt records, we can set a fairly high limit
 * here.
 */
#define MAX_INOBT_WALK_PREFETCH	\
	(PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))

/*
 * Given the number of records that the user wanted, set the number of inobt
 * records that we buffer in memory. Set the maximum if @inobt_records == 0.
 */
static inline unsigned int
xfs_inobt_walk_prefetch(
	unsigned int		inobt_records)
{
	/*
	 * If the caller didn't tell us the number of inobt records they
	 * wanted, assume the maximum prefetch possible for best performance.
	 */
	if (inobt_records == 0)
		inobt_records = MAX_INOBT_WALK_PREFETCH;

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record. This simplifies the AG inode walk loop setup code.
	 */
	inobt_records = max(inobt_records, 2U);

	/*
	 * Cap prefetch at the maximum so that we don't use an absurd amount
	 * of memory.
	 */
	return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
}

/*
 * Walk all inode btree records in the filesystem starting from @startino. The
 * @inobt_walk_fn will be called for each btree record, being passed the incore
 * record and @data. @inobt_records controls how many inobt records we try to
 * cache ahead of time.
 */
int
xfs_inobt_walk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_inobt_walk_fn	inobt_walk_fn,
	unsigned int		inobt_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.inobt_walk_fn	= inobt_walk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
		.pwork		= XFS_PWORK_SINGLE_THREADED,
		.lastino	= NULLFSINO,
	};
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for_each_perag_from(mp, agno, pag) {
		iwag.pag = pag;
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
		iwag.pag = NULL;
	}

	if (iwag.pag)
		xfs_perag_rele(pag);
	xfs_iwalk_free(&iwag);
	return error;
}
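
/*
 * Illustrative sketch of an inobt walk function (hypothetical, not used
 * anywhere in this file): sum the free-inode counts of every record
 * visited. The signature follows the xfs_inobt_walk_fn typedef declared
 * in xfs_iwalk.h; the accumulator name is invented for the example.
 */
static inline int
xfs_inobt_walk_example_fn(
	struct xfs_mount		*mp,
	struct xfs_trans		*tp,
	xfs_agnumber_t			agno,
	const struct xfs_inobt_rec_incore *irec,
	void				*data)
{
	unsigned long long		*free_inodes = data;

	/* One whole record per call; no per-inode iteration happens here. */
	*free_inodes += irec->ir_freecount;
	return 0;
}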