// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * Copyright (C) 2019 Oracle.  All Rights Reserved.
 * Author: Darrick J. Wong <darrick.wong@oracle.com>
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_shared.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_inode.h"
#include "xfs_btree.h"
#include "xfs_ialloc.h"
#include "xfs_ialloc_btree.h"
#include "xfs_iwalk.h"
#include "xfs_error.h"
#include "xfs_trace.h"
#include "xfs_icache.h"
#include "xfs_health.h"
#include "xfs_trans.h"
#include "xfs_pwork.h"
#include "xfs_ag.h"

/*
 * Walking Inodes in the Filesystem
 * ================================
 *
 * This iterator function walks a subset of filesystem inodes in increasing
 * order from @startino until there are no more inodes.  For each allocated
 * inode it finds, it calls a walk function with the relevant inode number and
 * a pointer to caller-provided data.  The walk function can return the usual
 * negative error code to stop the iteration with an error; 0 to continue the
 * iteration; or -ECANCELED to stop the iteration cleanly.  Whichever value
 * the walk function returned is passed back to the caller.
 *
 * Internally, we allow the walk function to do anything, which means that we
 * cannot maintain the inobt cursor or our lock on the AGI buffer.  We
 * therefore cache the inobt records in kernel memory and only call the walk
 * function when our memory buffer is full.  @nr_recs is the number of records
 * that we've cached, and @sz_recs is the size of our cache.
 *
 * It is the responsibility of the walk function to ensure it accesses
 * allocated inodes, as the inobt records may be stale by the time they are
 * acted upon.
 */

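/*
 * A minimal usage sketch (illustrative only; xfs_iwalk_count_fn is a
 * hypothetical helper, not part of this file): count every allocated inode
 * in the filesystem.  Returning 0 continues the walk; a negative errno or
 * -ECANCELED would end it early.
 */
static int
xfs_iwalk_count_fn(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		ino,
	void			*data)
{
	uint64_t		*count = data;

	(*count)++;
	return 0;
}

/*
 * The walk itself would then be kicked off with something like:
 *
 *	uint64_t	count = 0;
 *	int		error;
 *
 *	error = xfs_iwalk(mp, NULL, 0, 0, xfs_iwalk_count_fn, 0, &count);
 */
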
struct xfs_iwalk_ag {
	/* parallel work control data; will be null if single threaded */
	struct xfs_pwork		pwork;

	struct xfs_mount		*mp;
	struct xfs_trans		*tp;
	struct xfs_perag		*pag;

	/* Where do we start the traversal? */
	xfs_ino_t			startino;

	/* What was the last inode number we saw when iterating the inobt? */
	xfs_ino_t			lastino;

	/* Array of inobt records we cache. */
	struct xfs_inobt_rec_incore	*recs;

	/* Number of entries allocated for the @recs array. */
	unsigned int			sz_recs;

	/* Number of entries in the @recs array that are in use. */
	unsigned int			nr_recs;
	/* Inode and inobt walk functions, plus the caller's data pointer. */
	xfs_iwalk_fn			iwalk_fn;
	xfs_inobt_walk_fn		inobt_walk_fn;
	void				*data;

	/*
	 * Make it look like the inodes up to startino are free so that
	 * bulkstat can start its inode iteration at the correct place without
	 * needing to special case everywhere.
	 */
	unsigned int			trim_start:1;

	/* Skip empty inobt records? */
	unsigned int			skip_empty:1;
};

/*
 * Loop over all clusters in a chunk for a given incore inode allocation btree
 * record.  Do a readahead if there are any allocated inodes in that cluster.
 */
STATIC void
xfs_iwalk_ichunk_ra(
	struct xfs_mount		*mp,
	struct xfs_perag		*pag,
	struct xfs_inobt_rec_incore	*irec)
{
	struct xfs_ino_geometry		*igeo = M_IGEO(mp);
	xfs_agblock_t			agbno;
	struct blk_plug			plug;
	int				i;	/* inode chunk index */

	agbno = XFS_AGINO_TO_AGBNO(mp, irec->ir_startino);

	blk_start_plug(&plug);
	for (i = 0; i < XFS_INODES_PER_CHUNK; i += igeo->inodes_per_cluster) {
		xfs_inofree_t	imask;

		imask = xfs_inobt_maskn(i, igeo->inodes_per_cluster);
		if (imask & ~irec->ir_free) {
			xfs_btree_reada_bufs(mp, pag->pag_agno, agbno,
					igeo->blocks_per_cluster,
					&xfs_inode_buf_ops);
		}
		agbno += igeo->blocks_per_cluster;
	}
	blk_finish_plug(&plug);
}

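/*
 * Worked example (hypothetical geometry): with 64 inodes per chunk and 16
 * inodes per cluster, the loop above tests four 16-bit windows of the free
 * mask.  A window where imask & ~ir_free is nonzero holds at least one
 * allocated inode, so that cluster's buffers are queued for readahead; a
 * fully free cluster is skipped entirely.
 */
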
/*
 * Set the bits in @irec's free mask that correspond to the inodes before
 * @agino so that we skip them.  This is how we restart an inode walk that was
 * interrupted in the middle of an inode record.
 */
STATIC void
xfs_iwalk_adjust_start(
	xfs_agino_t			agino,	/* starting inode of chunk */
	struct xfs_inobt_rec_incore	*irec)	/* btree record */
{
	int				idx;	/* index into inode chunk */
	int				i;

	idx = agino - irec->ir_startino;

	/*
	 * We found the chunk record that covers our starting inode.  Mark all
	 * the inodes before our start point free, because we aren't
	 * interested in them; bump the free count to match so the record
	 * stays internally consistent.
	 */
	for (i = 0; i < idx; i++) {
		if (XFS_INOBT_MASK(i) & ~irec->ir_free)
			irec->ir_freecount++;
	}

	irec->ir_free |= xfs_inobt_maskn(0, idx);
}

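/*
 * Worked example (hypothetical numbers): if ir_startino is 128 and the
 * caller restarts at agino 140, then idx is 12.  Any of inodes 128-139 that
 * were allocated bump ir_freecount, and bits 0-11 are ORed into ir_free, so
 * the walk resumes exactly at inode 140.
 */
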
/* Allocate memory for a walk. */
STATIC int
xfs_iwalk_alloc(
	struct xfs_iwalk_ag	*iwag)
{
	size_t			size;

	ASSERT(iwag->recs == NULL);
	iwag->nr_recs = 0;

	/* Allocate a prefetch buffer for inobt records. */
	size = iwag->sz_recs * sizeof(struct xfs_inobt_rec_incore);
	iwag->recs = kmem_alloc(size, KM_MAYFAIL);
	if (iwag->recs == NULL)
		return -ENOMEM;

	return 0;
}

/* Free memory we allocated for a walk. */
STATIC void
xfs_iwalk_free(
	struct xfs_iwalk_ag	*iwag)
{
	kmem_free(iwag->recs);
	iwag->recs = NULL;
}

/* For each inuse inode in each cached inobt record, call our function. */
STATIC int
xfs_iwalk_ag_recs(
	struct xfs_iwalk_ag	*iwag)
{
	struct xfs_mount	*mp = iwag->mp;
	struct xfs_trans	*tp = iwag->tp;
	struct xfs_perag	*pag = iwag->pag;
	xfs_ino_t		ino;
	unsigned int		i, j;
	int			error;

	for (i = 0; i < iwag->nr_recs; i++) {
		struct xfs_inobt_rec_incore	*irec = &iwag->recs[i];

		trace_xfs_iwalk_ag_rec(mp, pag->pag_agno, irec);

		if (xfs_pwork_want_abort(&iwag->pwork))
			return 0;

		if (iwag->inobt_walk_fn) {
			error = iwag->inobt_walk_fn(mp, tp, pag->pag_agno, irec,
					iwag->data);
			if (error)
				return error;
		}

		if (!iwag->iwalk_fn)
			continue;

		for (j = 0; j < XFS_INODES_PER_CHUNK; j++) {
			if (xfs_pwork_want_abort(&iwag->pwork))
				return 0;

			/* Skip if this inode is free */
			if (XFS_INOBT_MASK(j) & irec->ir_free)
				continue;

			/* Otherwise call our function. */
			ino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
						irec->ir_startino + j);
			error = iwag->iwalk_fn(mp, tp, ino, iwag->data);
			if (error)
				return error;
		}
	}

	return 0;
}

/* Delete cursor and let go of AGI. */
static inline void
xfs_iwalk_del_inobt(
	struct xfs_trans	*tp,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			error)
{
	if (*curpp) {
		xfs_btree_del_cursor(*curpp, error);
		*curpp = NULL;
	}
	if (*agi_bpp) {
		xfs_trans_brelse(tp, *agi_bpp);
		*agi_bpp = NULL;
	}
}

/*
 * Set ourselves up for walking inobt records starting from a given point in
 * the filesystem.
 *
 * If caller passed in a nonzero start inode number, load the record from the
 * inobt and make the record look like all the inodes before agino are free so
 * that we skip them, and then move the cursor to the next inobt record.  This
 * is how we support starting an iwalk in the middle of an inode chunk.
 *
 * If the caller passed in a start number of zero, move the cursor to the first
 * inobt record.
 *
 * The caller is responsible for cleaning up the cursor and buffer pointer
 * regardless of the error status.
 */
STATIC int
xfs_iwalk_ag_start(
	struct xfs_iwalk_ag	*iwag,
	xfs_agino_t		agino,
	struct xfs_btree_cur	**curpp,
	struct xfs_buf		**agi_bpp,
	int			*has_more)
{
	struct xfs_mount	*mp = iwag->mp;
	struct xfs_trans	*tp = iwag->tp;
	struct xfs_perag	*pag = iwag->pag;
	struct xfs_inobt_rec_incore *irec;
	int			error;

	/* Set up a fresh cursor and empty the inobt cache. */
	iwag->nr_recs = 0;
	error = xfs_inobt_cur(mp, tp, pag, XFS_BTNUM_INO, curpp, agi_bpp);
	if (error)
		return error;

	/* Starting at the beginning of the AG?  That's easy! */
	if (agino == 0)
		return xfs_inobt_lookup(*curpp, 0, XFS_LOOKUP_GE, has_more);

	/*
	 * Otherwise, we have to grab the inobt record where we left off, stuff
	 * the record into our cache, and then see if there are more records.
	 * We require a lookup cache of at least two elements so that the
	 * caller doesn't have to deal with tearing down the cursor to walk the
	 * records.
	 */
	error = xfs_inobt_lookup(*curpp, agino, XFS_LOOKUP_LE, has_more);
	if (error)
		return error;

	/*
	 * If the LE lookup at @agino yields no records, jump ahead to the
	 * inobt cursor increment to see if there are more records to process.
	 */
	if (!*has_more)
		goto out_advance;

	/* Get the record, should always work */
	irec = &iwag->recs[iwag->nr_recs];
	error = xfs_inobt_get_rec(*curpp, irec, has_more);
	if (error)
		return error;
	if (XFS_IS_CORRUPT(mp, *has_more != 1))
		return -EFSCORRUPTED;

	iwag->lastino = XFS_AGINO_TO_INO(mp, pag->pag_agno,
				irec->ir_startino + XFS_INODES_PER_CHUNK - 1);

	/*
	 * If the LE lookup yielded an inobt record before the cursor position,
	 * skip it and see if there's another one after it.
	 */
	if (irec->ir_startino + XFS_INODES_PER_CHUNK <= agino)
		goto out_advance;

	/*
	 * If agino fell in the middle of the inode record, make it look like
	 * the inodes up to agino are free so that we don't return them again.
	 */
	if (iwag->trim_start)
		xfs_iwalk_adjust_start(agino, irec);

	/*
	 * The prefetch calculation is supposed to give us a large enough inobt
	 * record cache that the partial first record staged above fits, and
	 * the loop body can cache a record without having to check for cache
	 * space until after it reads an inobt record.
	 */
	iwag->nr_recs++;
	ASSERT(iwag->nr_recs < iwag->sz_recs);

out_advance:
	return xfs_btree_increment(*curpp, 0, has_more);
}

/*
 * The inobt record cache is full, so preserve the inobt cursor state and
 * run callbacks on the cached inobt records.  When we're done, restore the
 * cursor state to wherever the cursor would have been had the cache not been
 * full (and therefore we could've just incremented the cursor) if *@has_more
 * is true.  On exit, *@has_more will indicate whether or not the caller should
 * try for more inode records.
 */
STATIC int
xfs_iwalk_run_callbacks(
	struct xfs_iwalk_ag		*iwag,
	struct xfs_btree_cur		**curpp,
	struct xfs_buf			**agi_bpp,
	int				*has_more)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	struct xfs_inobt_rec_incore	*irec;
	xfs_agino_t			next_agino;
	int				error;

	next_agino = XFS_INO_TO_AGINO(mp, iwag->lastino) + 1;

	ASSERT(iwag->nr_recs > 0);

	/* Delete cursor but remember the last record we cached... */
	xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
	irec = &iwag->recs[iwag->nr_recs - 1];
	ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);

	error = xfs_iwalk_ag_recs(iwag);
	if (error)
		return error;

	/* ...empty the cache... */
	iwag->nr_recs = 0;

	/*
	 * Both callers pass a valid @has_more pointer, so test the value it
	 * points at; there's no need to rebuild the cursor if the walk has
	 * already run out of records.
	 */
	if (!*has_more)
		return 0;

	/* ...and recreate the cursor just past where we left off. */
	error = xfs_inobt_cur(mp, tp, iwag->pag, XFS_BTNUM_INO, curpp, agi_bpp);
	if (error)
		return error;

	return xfs_inobt_lookup(*curpp, next_agino, XFS_LOOKUP_GE, has_more);
}

/* Walk all inodes in a single AG, from @iwag->startino to the end of the AG. */
STATIC int
xfs_iwalk_ag(
	struct xfs_iwalk_ag		*iwag)
{
	struct xfs_mount		*mp = iwag->mp;
	struct xfs_trans		*tp = iwag->tp;
	struct xfs_perag		*pag = iwag->pag;
	struct xfs_buf			*agi_bp = NULL;
	struct xfs_btree_cur		*cur = NULL;
	xfs_agino_t			agino;
	int				has_more;
	int				error = 0;

	/* Set up our cursor at the right place in the inode btree. */
	ASSERT(pag->pag_agno == XFS_INO_TO_AGNO(mp, iwag->startino));
	agino = XFS_INO_TO_AGINO(mp, iwag->startino);
	error = xfs_iwalk_ag_start(iwag, agino, &cur, &agi_bp, &has_more);

	while (!error && has_more) {
		struct xfs_inobt_rec_incore	*irec;
		xfs_ino_t			rec_fsino;

		cond_resched();
		if (xfs_pwork_want_abort(&iwag->pwork))
			goto out;

		/* Fetch the inobt record. */
		irec = &iwag->recs[iwag->nr_recs];
		error = xfs_inobt_get_rec(cur, irec, &has_more);
		if (error || !has_more)
			break;

		/* Make sure that we always move forward. */
		rec_fsino = XFS_AGINO_TO_INO(mp, pag->pag_agno, irec->ir_startino);
		if (iwag->lastino != NULLFSINO &&
		    XFS_IS_CORRUPT(mp, iwag->lastino >= rec_fsino)) {
			error = -EFSCORRUPTED;
			goto out;
		}
		iwag->lastino = rec_fsino + XFS_INODES_PER_CHUNK - 1;

		/* No allocated inodes in this chunk; skip it. */
		if (iwag->skip_empty && irec->ir_freecount == irec->ir_count) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error)
				break;
			continue;
		}

		/*
		 * Start readahead for this inode chunk in anticipation of
		 * walking the inodes.
		 */
		if (iwag->iwalk_fn)
			xfs_iwalk_ichunk_ra(mp, pag, irec);

		/*
		 * If there's space in the buffer for more records, increment
		 * the btree cursor and grab more.
		 */
		if (++iwag->nr_recs < iwag->sz_recs) {
			error = xfs_btree_increment(cur, 0, &has_more);
			if (error || !has_more)
				break;
			continue;
		}

		/*
		 * Otherwise, we need to save cursor state and run the callback
		 * function on the cached records.  The run_callbacks function
		 * is supposed to return a cursor pointing to the record where
		 * we would be if we had been able to increment like above.
		 */
		ASSERT(has_more);
		error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);
	}

	if (iwag->nr_recs == 0 || error)
		goto out;

	/* Walk the unprocessed records in the cache. */
	error = xfs_iwalk_run_callbacks(iwag, &cur, &agi_bp, &has_more);

out:
	xfs_iwalk_del_inobt(tp, &cur, &agi_bp, error);
	return error;
}

/*
 * We experimentally determined that the reduction in ioctl call overhead
 * diminishes when userspace asks for more than 2048 inodes, so we'll cap
 * prefetch at this point.
 */
#define IWALK_MAX_INODE_PREFETCH	(2048U)

/*
 * Given the number of inodes to prefetch, set the number of inobt records that
 * we cache in memory, which controls the number of inodes we try to read
 * ahead.  Set the maximum if @inodes == 0.
 */
static inline unsigned int
xfs_iwalk_prefetch(
	unsigned int		inodes)
{
	unsigned int		inobt_records;

	/*
	 * If the caller didn't tell us the number of inodes they wanted,
	 * assume the maximum prefetch possible for best performance.
	 * Otherwise, cap prefetch at that maximum so that we don't start an
	 * absurd amount of prefetch.
	 */
	if (inodes == 0)
		inodes = IWALK_MAX_INODE_PREFETCH;
	inodes = min(inodes, IWALK_MAX_INODE_PREFETCH);

	/* Round the inode count up to a full chunk. */
	inodes = round_up(inodes, XFS_INODES_PER_CHUNK);

	/*
	 * In order to convert the number of inodes to prefetch into an
	 * estimate of the number of inobt records to cache, we require a
	 * conversion factor that reflects our expectations of the average
	 * loading factor of an inode chunk.  Based on data gathered, most
	 * (but not all) filesystems manage to keep the inode chunks totally
	 * full, so we'll underestimate slightly so that our readahead will
	 * still deliver the performance we want on aging filesystems:
	 *
	 * inobt = inodes / (INODES_PER_CHUNK * (4 / 5));
	 *
	 * The funny math is to avoid integer division.
	 */
	inobt_records = (inodes * 5) / (4 * XFS_INODES_PER_CHUNK);

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record.  This simplifies the AG inode walk loop setup code.
	 */
	return max(inobt_records, 2U);
}

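/*
 * Worked example of the sizing above: a caller asking for 1000 inodes is
 * rounded up to 1024 (sixteen 64-inode chunks), and (1024 * 5) / (4 * 64)
 * yields 20 cached records -- the same sixteen chunks divided by the 4/5
 * expected loading factor, computed without integer division discarding
 * the fraction too early.
 */
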
/*
 * Walk all inodes in the filesystem starting from @startino.  The @iwalk_fn
 * will be called for each allocated inode, being passed the inode's number and
 * @data.  @inode_records controls how many inobt records' worth of inodes we
 * try to readahead.
 */
int
xfs_iwalk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.iwalk_fn	= iwalk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_iwalk_prefetch(inode_records),
		.trim_start	= 1,
		.skip_empty	= 1,
		.pwork		= XFS_PWORK_SINGLE_THREADED,
		.lastino	= NULLFSINO,
	};
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for_each_perag_from(mp, agno, pag) {
		iwag.pag = pag;
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
		if (flags & XFS_IWALK_SAME_AG)
			break;
		iwag.pag = NULL;
	}

	if (iwag.pag)
		xfs_perag_put(pag);
	xfs_iwalk_free(&iwag);
	return error;
}

/* Run per-thread iwalk work. */
static int
xfs_iwalk_ag_work(
	struct xfs_mount	*mp,
	struct xfs_pwork	*pwork)
{
	struct xfs_iwalk_ag	*iwag;
	int			error = 0;

	iwag = container_of(pwork, struct xfs_iwalk_ag, pwork);
	if (xfs_pwork_want_abort(pwork))
		goto out;

	error = xfs_iwalk_alloc(iwag);
	if (error)
		goto out;

	error = xfs_iwalk_ag(iwag);
	xfs_iwalk_free(iwag);
out:
	xfs_perag_put(iwag->pag);
	kmem_free(iwag);
	return error;
}

/*
 * Walk all the inodes in the filesystem using multiple threads to process each
 * AG.
 */
int
xfs_iwalk_threaded(
	struct xfs_mount	*mp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_iwalk_fn		iwalk_fn,
	unsigned int		inode_records,
	bool			polled,
	void			*data)
{
	struct xfs_pwork_ctl	pctl;
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_IWALK_FLAGS_ALL));

	error = xfs_pwork_init(mp, &pctl, xfs_iwalk_ag_work, "xfs_iwalk");
	if (error)
		return error;

	for_each_perag_from(mp, agno, pag) {
		struct xfs_iwalk_ag	*iwag;

		if (xfs_pwork_ctl_want_abort(&pctl))
			break;

		iwag = kmem_zalloc(sizeof(struct xfs_iwalk_ag), 0);
		iwag->mp = mp;

		/*
		 * perag is being handed off to async work, so take another
		 * reference for the async work to release.
		 */
		atomic_inc(&pag->pag_ref);
		iwag->pag = pag;
		iwag->iwalk_fn = iwalk_fn;
		iwag->data = data;
		iwag->startino = startino;
		iwag->sz_recs = xfs_iwalk_prefetch(inode_records);
		iwag->lastino = NULLFSINO;
		xfs_pwork_queue(&pctl, &iwag->pwork);
		startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
		if (flags & XFS_IWALK_SAME_AG)
			break;
	}
	if (pag)
		xfs_perag_put(pag);
	if (polled)
		xfs_pwork_poll(&pctl);
	return xfs_pwork_destroy(&pctl);
}

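/*
 * Illustrative call (hypothetical, reusing the xfs_iwalk_count_fn sketch
 * from above): walk every AG in parallel.  Each AG is processed by its own
 * worker, so @data must be safe for concurrent access -- a simple counter
 * would have to become an atomic64_t updated with atomic64_inc():
 *
 *	error = xfs_iwalk_threaded(mp, 0, 0, xfs_iwalk_count_fn, 0,
 *			false, &count);
 */
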
/*
 * Allow callers to cache up to a page's worth of inobt records.  This reflects
 * the existing inumbers prefetching behavior.  Since the inobt walk does not
 * itself do anything with the inobt records, we can set a fairly high limit
 * here.
 */
#define MAX_INOBT_WALK_PREFETCH	\
	(PAGE_SIZE / sizeof(struct xfs_inobt_rec_incore))

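/*
 * For a sense of scale (assuming a 4 KiB page and a 16-byte incore inobt
 * record), this works out to 256 cached records, i.e. 16384 inodes' worth
 * of btree data per walk.
 */
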
/*
 * Given the number of records that the user wanted, set the number of inobt
 * records that we buffer in memory.  Set the maximum if @inobt_records == 0.
 */
static inline unsigned int
xfs_inobt_walk_prefetch(
	unsigned int		inobt_records)
{
	/*
	 * If the caller didn't tell us the number of inobt records they
	 * wanted, assume the maximum prefetch possible for best performance.
	 */
	if (inobt_records == 0)
		inobt_records = MAX_INOBT_WALK_PREFETCH;

	/*
	 * Allocate enough space to prefetch at least two inobt records so that
	 * we can cache both the record where the iwalk started and the next
	 * record.  This simplifies the AG inode walk loop setup code.
	 */
	inobt_records = max(inobt_records, 2U);

	/*
	 * Cap prefetch at that maximum so that we don't use an absurd amount
	 * of memory.
	 */
	return min_t(unsigned int, inobt_records, MAX_INOBT_WALK_PREFETCH);
}

/*
 * Walk all inode btree records in the filesystem starting from @startino.  The
 * @inobt_walk_fn will be called for each btree record, being passed the incore
 * record and @data.  @inobt_records controls how many inobt records we try to
 * cache ahead of time.
 */
int
xfs_inobt_walk(
	struct xfs_mount	*mp,
	struct xfs_trans	*tp,
	xfs_ino_t		startino,
	unsigned int		flags,
	xfs_inobt_walk_fn	inobt_walk_fn,
	unsigned int		inobt_records,
	void			*data)
{
	struct xfs_iwalk_ag	iwag = {
		.mp		= mp,
		.tp		= tp,
		.inobt_walk_fn	= inobt_walk_fn,
		.data		= data,
		.startino	= startino,
		.sz_recs	= xfs_inobt_walk_prefetch(inobt_records),
		.pwork		= XFS_PWORK_SINGLE_THREADED,
		.lastino	= NULLFSINO,
	};
	struct xfs_perag	*pag;
	xfs_agnumber_t		agno = XFS_INO_TO_AGNO(mp, startino);
	int			error;

	ASSERT(agno < mp->m_sb.sb_agcount);
	ASSERT(!(flags & ~XFS_INOBT_WALK_FLAGS_ALL));

	error = xfs_iwalk_alloc(&iwag);
	if (error)
		return error;

	for_each_perag_from(mp, agno, pag) {
		iwag.pag = pag;
		error = xfs_iwalk_ag(&iwag);
		if (error)
			break;
		iwag.startino = XFS_AGINO_TO_INO(mp, pag->pag_agno + 1, 0);
		if (flags & XFS_INOBT_WALK_SAME_AG)
			break;
		iwag.pag = NULL;
	}

	if (iwag.pag)
		xfs_perag_put(pag);
	xfs_iwalk_free(&iwag);
	return error;
}
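
/*
 * A minimal sketch of the inobt flavor (illustrative only; this helper is
 * not part of the original file): total up the free inodes recorded in each
 * inobt record, without ever touching the inodes themselves.
 */
static int
xfs_inobt_count_free_fn(
	struct xfs_mount		*mp,
	struct xfs_trans		*tp,
	xfs_agnumber_t			agno,
	const struct xfs_inobt_rec_incore *irec,
	void				*data)
{
	uint64_t			*freecount = data;

	*freecount += irec->ir_freecount;
	return 0;
}

/*
 * As with xfs_iwalk(), the walk would be started with something like:
 *
 *	error = xfs_inobt_walk(mp, NULL, 0, 0, xfs_inobt_count_free_fn,
 *			0, &freecount);
 */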