xref: /linux/fs/xfs/scrub/btree.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_inode.h"
13 #include "xfs_btree.h"
14 #include "scrub/scrub.h"
15 #include "scrub/common.h"
16 #include "scrub/btree.h"
17 #include "scrub/trace.h"
18 
19 /* btree scrubbing */
20 
/*
 * Check for btree operation errors.  See the section about handling
 * operational errors in common.c.
 *
 * Returns true if *error was zero (nothing to handle).  Otherwise the
 * failure is traced, corruption-class errors additionally set @errflag in
 * the scrub state and clear *error so the scan can continue, and false is
 * returned to tell the caller to stop using the current block/cursor.
 */
static bool
__xchk_btree_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	int			*error,
	__u32			errflag,
	void			*ret_ip)
{
	/* Fast path: no error at all. */
	if (*error == 0)
		return true;

	switch (*error) {
	case -EDEADLOCK:
	case -ECHRNG:
		/* Used to restart an op with deadlock avoidance. */
		trace_xchk_deadlock_retry(sc->ip, sc->sm, *error);
		break;
	case -EFSBADCRC:
	case -EFSCORRUPTED:
	case -EIO:
	case -ENODATA:
		/* Note the badness but don't abort. */
		sc->sm->sm_flags |= errflag;
		*error = 0;
		/*
		 * Deliberately fall through so the op-error tracepoint fires
		 * too.  Note *error is already zero at that point for these
		 * corruption-class errors.
		 */
		fallthrough;
	default:
		/* Pick the tracepoint matching the btree's root type. */
		if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
			trace_xchk_ifork_btree_op_error(sc, cur, level,
					*error, ret_ip);
		else
			trace_xchk_btree_op_error(sc, cur, level,
					*error, ret_ip);
		break;
	}
	return false;
}
62 
/*
 * Process an operational error while scrubbing this btree directly;
 * corruption-class errors mark the data structure CORRUPT.  Returns true
 * if there was no error and the caller may proceed.
 */
bool
xchk_btree_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	int			*error)
{
	return __xchk_btree_process_error(sc, cur, level, error,
			XFS_SCRUB_OFLAG_CORRUPT, __return_address);
}
73 
/*
 * Process an operational error during a cross-reference check; failures
 * are recorded as XFAIL (cross-referencing failed) rather than CORRUPT.
 * Returns true if there was no error and the caller may proceed.
 */
bool
xchk_btree_xref_process_error(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level,
	int			*error)
{
	return __xchk_btree_process_error(sc, cur, level, error,
			XFS_SCRUB_OFLAG_XFAIL, __return_address);
}
84 
85 /* Record btree block corruption. */
86 static void
87 __xchk_btree_set_corrupt(
88 	struct xfs_scrub	*sc,
89 	struct xfs_btree_cur	*cur,
90 	int			level,
91 	__u32			errflag,
92 	void			*ret_ip)
93 {
94 	sc->sm->sm_flags |= errflag;
95 
96 	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
97 		trace_xchk_ifork_btree_error(sc, cur, level,
98 				ret_ip);
99 	else
100 		trace_xchk_btree_error(sc, cur, level,
101 				ret_ip);
102 }
103 
/* Flag this btree level as corrupt (XFS_SCRUB_OFLAG_CORRUPT). */
void
xchk_btree_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level)
{
	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_CORRUPT,
			__return_address);
}
113 
/* Flag a cross-referencing discrepancy at this level (XCORRUPT). */
void
xchk_btree_xref_set_corrupt(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level)
{
	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_XCORRUPT,
			__return_address);
}
123 
/* Flag a non-fatal oddity at this level that preening could clean up. */
void
xchk_btree_set_preen(
	struct xfs_scrub	*sc,
	struct xfs_btree_cur	*cur,
	int			level)
{
	__xchk_btree_set_corrupt(sc, cur, level, XFS_SCRUB_OFLAG_PREEN,
			__return_address);
}
133 
134 /*
135  * Make sure this record is in order and doesn't stray outside of the parent
136  * keys.
137  */
138 STATIC void
139 xchk_btree_rec(
140 	struct xchk_btree	*bs)
141 {
142 	struct xfs_btree_cur	*cur = bs->cur;
143 	union xfs_btree_rec	*rec;
144 	union xfs_btree_key	key;
145 	union xfs_btree_key	hkey;
146 	union xfs_btree_key	*keyp;
147 	struct xfs_btree_block	*block;
148 	struct xfs_btree_block	*keyblock;
149 	struct xfs_buf		*bp;
150 
151 	block = xfs_btree_get_block(cur, 0, &bp);
152 	rec = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr, block);
153 
154 	trace_xchk_btree_rec(bs->sc, cur, 0);
155 
156 	/* Are all records across all record blocks in order? */
157 	if (bs->lastrec_valid &&
158 	    !cur->bc_ops->recs_inorder(cur, &bs->lastrec, rec))
159 		xchk_btree_set_corrupt(bs->sc, cur, 0);
160 	memcpy(&bs->lastrec, rec, cur->bc_ops->rec_len);
161 	bs->lastrec_valid = true;
162 
163 	if (cur->bc_nlevels == 1)
164 		return;
165 
166 	/* Is low_key(rec) at least as large as the parent low key? */
167 	cur->bc_ops->init_key_from_rec(&key, rec);
168 	keyblock = xfs_btree_get_block(cur, 1, &bp);
169 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
170 	if (xfs_btree_keycmp_lt(cur, &key, keyp))
171 		xchk_btree_set_corrupt(bs->sc, cur, 1);
172 
173 	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
174 		return;
175 
176 	/* Is high_key(rec) no larger than the parent high key? */
177 	cur->bc_ops->init_high_key_from_rec(&hkey, rec);
178 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[1].ptr, keyblock);
179 	if (xfs_btree_keycmp_lt(cur, keyp, &hkey))
180 		xchk_btree_set_corrupt(bs->sc, cur, 1);
181 }
182 
183 /*
184  * Make sure this key is in order and doesn't stray outside of the parent
185  * keys.
186  */
187 STATIC void
188 xchk_btree_key(
189 	struct xchk_btree	*bs,
190 	int			level)
191 {
192 	struct xfs_btree_cur	*cur = bs->cur;
193 	union xfs_btree_key	*key;
194 	union xfs_btree_key	*keyp;
195 	struct xfs_btree_block	*block;
196 	struct xfs_btree_block	*keyblock;
197 	struct xfs_buf		*bp;
198 
199 	block = xfs_btree_get_block(cur, level, &bp);
200 	key = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
201 
202 	trace_xchk_btree_key(bs->sc, cur, level);
203 
204 	/* Are all low keys across all node blocks in order? */
205 	if (bs->lastkey[level - 1].valid &&
206 	    !cur->bc_ops->keys_inorder(cur, &bs->lastkey[level - 1].key, key))
207 		xchk_btree_set_corrupt(bs->sc, cur, level);
208 	memcpy(&bs->lastkey[level - 1].key, key, cur->bc_ops->key_len);
209 	bs->lastkey[level - 1].valid = true;
210 
211 	if (level + 1 >= cur->bc_nlevels)
212 		return;
213 
214 	/* Is this block's low key at least as large as the parent low key? */
215 	keyblock = xfs_btree_get_block(cur, level + 1, &bp);
216 	keyp = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr, keyblock);
217 	if (xfs_btree_keycmp_lt(cur, key, keyp))
218 		xchk_btree_set_corrupt(bs->sc, cur, level);
219 
220 	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
221 		return;
222 
223 	/* Is this block's high key no larger than the parent high key? */
224 	key = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr, block);
225 	keyp = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
226 			keyblock);
227 	if (xfs_btree_keycmp_lt(cur, keyp, key))
228 		xchk_btree_set_corrupt(bs->sc, cur, level);
229 }
230 
231 /*
232  * Check a btree pointer.  Returns true if it's ok to use this pointer.
233  * Callers do not need to set the corrupt flag.
234  */
235 static bool
236 xchk_btree_ptr_ok(
237 	struct xchk_btree	*bs,
238 	int			level,
239 	union xfs_btree_ptr	*ptr)
240 {
241 	/* A btree rooted in an inode has no block pointer to the root. */
242 	if (bs->cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
243 	    level == bs->cur->bc_nlevels)
244 		return true;
245 
246 	/* Otherwise, check the pointers. */
247 	if (__xfs_btree_check_ptr(bs->cur, ptr, 0, level)) {
248 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
249 		return false;
250 	}
251 
252 	return true;
253 }
254 
/*
 * Check that a btree block's sibling matches what we expect it.
 *
 * Duplicates the scrub cursor, steps it one keyptr in @direction at the
 * parent level, and verifies that the keyptr it lands on matches @sibling.
 * A null @sibling must correspond to the parent cursor being unable to
 * move at all.  Any mismatch flags corruption; operational errors are
 * absorbed into the scrub state.
 */
STATIC int
xchk_btree_block_check_sibling(
	struct xchk_btree	*bs,
	int			level,
	int			direction,
	union xfs_btree_ptr	*sibling)
{
	struct xfs_btree_cur	*cur = bs->cur;
	struct xfs_btree_block	*pblock;
	struct xfs_buf		*pbp;
	struct xfs_btree_cur	*ncur = NULL;
	union xfs_btree_ptr	*pp;
	int			success;
	int			error;

	/* Work on a duplicate so we never disturb the live cursor. */
	error = xfs_btree_dup_cursor(cur, &ncur);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error) ||
	    !ncur)
		return error;

	/*
	 * If the pointer is null, we shouldn't be able to move the upper
	 * level pointer anywhere.
	 */
	if (xfs_btree_ptr_is_null(cur, sibling)) {
		if (direction > 0)
			error = xfs_btree_increment(ncur, level + 1, &success);
		else
			error = xfs_btree_decrement(ncur, level + 1, &success);
		if (error == 0 && success)
			xchk_btree_set_corrupt(bs->sc, cur, level);
		/* Errors from the probe move are deliberately discarded. */
		error = 0;
		goto out;
	}

	/* Increment upper level pointer. */
	if (direction > 0)
		error = xfs_btree_increment(ncur, level + 1, &success);
	else
		error = xfs_btree_decrement(ncur, level + 1, &success);
	if (!xchk_btree_process_error(bs->sc, cur, level + 1, &error))
		goto out;
	if (!success) {
		/* Non-null sibling but the parent cursor couldn't move. */
		xchk_btree_set_corrupt(bs->sc, cur, level + 1);
		goto out;
	}

	/* Compare upper level pointer to sibling pointer. */
	pblock = xfs_btree_get_block(ncur, level + 1, &pbp);
	pp = xfs_btree_ptr_addr(ncur, ncur->bc_levels[level + 1].ptr, pblock);
	if (!xchk_btree_ptr_ok(bs, level + 1, pp))
		goto out;
	/* pbp is NULL for an inode root; only recheck real buffers. */
	if (pbp)
		xchk_buffer_recheck(bs->sc, pbp);

	if (xfs_btree_cmp_two_ptrs(cur, pp, sibling))
		xchk_btree_set_corrupt(bs->sc, cur, level);
out:
	xfs_btree_del_cursor(ncur, XFS_BTREE_ERROR);
	return error;
}
317 
318 /* Check the siblings of a btree block. */
319 STATIC int
320 xchk_btree_block_check_siblings(
321 	struct xchk_btree	*bs,
322 	struct xfs_btree_block	*block)
323 {
324 	struct xfs_btree_cur	*cur = bs->cur;
325 	union xfs_btree_ptr	leftsib;
326 	union xfs_btree_ptr	rightsib;
327 	int			level;
328 	int			error = 0;
329 
330 	xfs_btree_get_sibling(cur, block, &leftsib, XFS_BB_LEFTSIB);
331 	xfs_btree_get_sibling(cur, block, &rightsib, XFS_BB_RIGHTSIB);
332 	level = xfs_btree_get_level(block);
333 
334 	/* Root block should never have siblings. */
335 	if (level == cur->bc_nlevels - 1) {
336 		if (!xfs_btree_ptr_is_null(cur, &leftsib) ||
337 		    !xfs_btree_ptr_is_null(cur, &rightsib))
338 			xchk_btree_set_corrupt(bs->sc, cur, level);
339 		goto out;
340 	}
341 
342 	/*
343 	 * Does the left & right sibling pointers match the adjacent
344 	 * parent level pointers?
345 	 * (These function absorbs error codes for us.)
346 	 */
347 	error = xchk_btree_block_check_sibling(bs, level, -1, &leftsib);
348 	if (error)
349 		return error;
350 	error = xchk_btree_block_check_sibling(bs, level, 1, &rightsib);
351 	if (error)
352 		return error;
353 out:
354 	return error;
355 }
356 
/*
 * Deferred block-owner check, queued on xchk_btree.to_check and processed
 * after the tree walk; see xchk_btree_check_owner().
 */
struct check_owner {
	struct list_head	list;
	xfs_daddr_t		daddr;	/* disk address of the btree block */
	int			level;	/* btree level the block was seen at */
};
362 
/*
 * Make sure this btree block isn't in the free list and that there's
 * an rmap record for it.
 *
 * Cross-references @daddr against the bnobt (block must be in use) and the
 * rmapbt (block must be owned solely by bs->oinfo).  Returns 0 or a
 * negative errno; cross-reference failures are absorbed as XFAIL flags.
 */
STATIC int
xchk_btree_check_block_owner(
	struct xchk_btree	*bs,
	int			level,
	xfs_daddr_t		daddr)
{
	xfs_agnumber_t		agno;
	xfs_agblock_t		agbno;
	bool			is_bnobt, is_rmapbt;
	bool			init_sa;
	int			error = 0;

	/* The cursor may have been nullified by an earlier check; bail. */
	if (!bs->cur)
		return 0;

	is_bnobt = xfs_btree_is_bno(bs->cur->bc_ops);
	is_rmapbt = xfs_btree_is_rmap(bs->cur->bc_ops);
	agno = xfs_daddr_to_agno(bs->cur->bc_mp, daddr);
	agbno = xfs_daddr_to_agbno(bs->cur->bc_mp, daddr);

	/*
	 * If the btree being examined is not itself a per-AG btree, initialize
	 * sc->sa so that we can check for the presence of an ownership record
	 * in the rmap btree for the AG containing the block.
	 */
	init_sa = bs->cur->bc_ops->type != XFS_BTREE_TYPE_AG;
	if (init_sa) {
		error = xchk_ag_init_existing(bs->sc, agno, &bs->sc->sa);
		if (!xchk_btree_xref_process_error(bs->sc, bs->cur,
				level, &error))
			goto out_free;
	}

	xchk_xref_is_used_space(bs->sc, agbno, 1);
	/*
	 * The bnobt scrubber aliases bs->cur to bs->sc->sa.bno_cur, so we
	 * have to nullify it (to shut down further block owner checks) if
	 * self-xref encounters problems.
	 */
	if (!bs->sc->sa.bno_cur && is_bnobt)
		bs->cur = NULL;

	/* Same aliasing hazard for the rmapbt scrubbing itself. */
	xchk_xref_is_only_owned_by(bs->sc, agbno, 1, bs->oinfo);
	if (!bs->sc->sa.rmap_cur && is_rmapbt)
		bs->cur = NULL;

out_free:
	if (init_sa)
		xchk_ag_free(bs->sc, &bs->sc->sa);

	return error;
}
419 
420 /* Check the owner of a btree block. */
421 STATIC int
422 xchk_btree_check_owner(
423 	struct xchk_btree	*bs,
424 	int			level,
425 	struct xfs_buf		*bp)
426 {
427 	struct xfs_btree_cur	*cur = bs->cur;
428 
429 	/*
430 	 * In theory, xfs_btree_get_block should only give us a null buffer
431 	 * pointer for the root of a root-in-inode btree type, but we need
432 	 * to check defensively here in case the cursor state is also screwed
433 	 * up.
434 	 */
435 	if (bp == NULL) {
436 		if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE)
437 			xchk_btree_set_corrupt(bs->sc, bs->cur, level);
438 		return 0;
439 	}
440 
441 	/*
442 	 * We want to cross-reference each btree block with the bnobt
443 	 * and the rmapbt.  We cannot cross-reference the bnobt or
444 	 * rmapbt while scanning the bnobt or rmapbt, respectively,
445 	 * because we cannot alter the cursor and we'd prefer not to
446 	 * duplicate cursors.  Therefore, save the buffer daddr for
447 	 * later scanning.
448 	 */
449 	if (xfs_btree_is_bno(cur->bc_ops) || xfs_btree_is_rmap(cur->bc_ops)) {
450 		struct check_owner	*co;
451 
452 		co = kmalloc(sizeof(struct check_owner), XCHK_GFP_FLAGS);
453 		if (!co)
454 			return -ENOMEM;
455 
456 		INIT_LIST_HEAD(&co->list);
457 		co->level = level;
458 		co->daddr = xfs_buf_daddr(bp);
459 		list_add_tail(&co->list, &bs->to_check);
460 		return 0;
461 	}
462 
463 	return xchk_btree_check_block_owner(bs, level, xfs_buf_daddr(bp));
464 }
465 
466 /* Decide if we want to check minrecs of a btree block in the inode root. */
467 static inline bool
468 xchk_btree_check_iroot_minrecs(
469 	struct xchk_btree	*bs)
470 {
471 	/*
472 	 * xfs_bmap_add_attrfork_btree had an implementation bug wherein it
473 	 * would miscalculate the space required for the data fork bmbt root
474 	 * when adding an attr fork, and promote the iroot contents to an
475 	 * external block unnecessarily.  This went unnoticed for many years
476 	 * until scrub found filesystems in this state.  Inode rooted btrees are
477 	 * not supposed to have immediate child blocks that are small enough
478 	 * that the contents could fit in the inode root, but we can't fail
479 	 * existing filesystems, so instead we disable the check for data fork
480 	 * bmap btrees when there's an attr fork.
481 	 */
482 	if (xfs_btree_is_bmap(bs->cur->bc_ops) &&
483 	    bs->cur->bc_ino.whichfork == XFS_DATA_FORK &&
484 	    xfs_inode_has_attr_fork(bs->sc->ip))
485 		return false;
486 
487 	return true;
488 }
489 
/*
 * Check that this btree block has at least minrecs records or is one of the
 * special blocks that don't require that.
 */
STATIC void
xchk_btree_check_minrecs(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	struct xfs_btree_cur	*cur = bs->cur;
	unsigned int		root_level = cur->bc_nlevels - 1;
	unsigned int		numrecs = be16_to_cpu(block->bb_numrecs);

	/* More records than minrecs means the block is ok. */
	if (numrecs >= cur->bc_ops->get_minrecs(cur, level))
		return;

	/*
	 * For btrees rooted in the inode, it's possible that the root block
	 * contents spilled into a regular ondisk block because there wasn't
	 * enough space in the inode root.  The number of records in that
	 * child block might be less than the standard minrecs, but that's ok
	 * provided that there's only one direct child of the root.
	 */
	if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE &&
	    level == cur->bc_nlevels - 2) {
		struct xfs_btree_block	*root_block;
		struct xfs_buf		*root_bp;
		int			root_maxrecs;

		root_block = xfs_btree_get_block(cur, root_level, &root_bp);
		root_maxrecs = cur->bc_ops->get_dmaxrecs(cur, root_level);
		/*
		 * The spill is legitimate only if the root has exactly one
		 * record and this child holds more than would have fit back
		 * in the inode root; anything else is corruption (unless the
		 * iroot check is disabled for legacy attr-fork damage).
		 */
		if (xchk_btree_check_iroot_minrecs(bs) &&
		    (be16_to_cpu(root_block->bb_numrecs) != 1 ||
		     numrecs <= root_maxrecs))
			xchk_btree_set_corrupt(bs->sc, cur, level);
		return;
	}

	/*
	 * Otherwise, only the root level is allowed to have fewer than minrecs
	 * records or keyptrs.
	 */
	if (level < root_level)
		xchk_btree_set_corrupt(bs->sc, cur, level);
}
537 
538 /*
539  * If this btree block has a parent, make sure that the parent's keys capture
540  * the keyspace contained in this block.
541  */
542 STATIC void
543 xchk_btree_block_check_keys(
544 	struct xchk_btree	*bs,
545 	int			level,
546 	struct xfs_btree_block	*block)
547 {
548 	union xfs_btree_key	block_key;
549 	union xfs_btree_key	*block_high_key;
550 	union xfs_btree_key	*parent_low_key, *parent_high_key;
551 	struct xfs_btree_cur	*cur = bs->cur;
552 	struct xfs_btree_block	*parent_block;
553 	struct xfs_buf		*bp;
554 
555 	if (level == cur->bc_nlevels - 1)
556 		return;
557 
558 	xfs_btree_get_keys(cur, block, &block_key);
559 
560 	/* Make sure the low key of this block matches the parent. */
561 	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
562 	parent_low_key = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
563 			parent_block);
564 	if (xfs_btree_keycmp_ne(cur, &block_key, parent_low_key)) {
565 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
566 		return;
567 	}
568 
569 	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
570 		return;
571 
572 	/* Make sure the high key of this block matches the parent. */
573 	parent_high_key = xfs_btree_high_key_addr(cur,
574 			cur->bc_levels[level + 1].ptr, parent_block);
575 	block_high_key = xfs_btree_high_key_from_key(cur, &block_key);
576 	if (xfs_btree_keycmp_ne(cur, block_high_key, parent_high_key))
577 		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
578 }
579 
/*
 * Grab and scrub a btree block given a btree pointer.  Returns block
 * and buffer pointers (if applicable) if they're ok to use.
 *
 * On success *pblock points at the block and *pbp at its buffer (*pbp stays
 * NULL for an inode root).  Corruption leaves them set but flags the scrub
 * state; operational errors are returned as negative errnos.
 */
STATIC int
xchk_btree_get_block(
	struct xchk_btree	*bs,
	int			level,
	union xfs_btree_ptr	*pp,
	struct xfs_btree_block	**pblock,
	struct xfs_buf		**pbp)
{
	int			error;

	*pblock = NULL;
	*pbp = NULL;

	error = xfs_btree_lookup_get_block(bs->cur, level, pp, pblock);
	if (!xchk_btree_process_error(bs->sc, bs->cur, level, &error) ||
	    !*pblock)
		return error;

	xfs_btree_get_block(bs->cur, level, pbp);
	/* A block that fails the generic checks is corrupt, not an error. */
	if (__xfs_btree_check_block(bs->cur, *pblock, level, *pbp)) {
		xchk_btree_set_corrupt(bs->sc, bs->cur, level);
		return 0;
	}
	if (*pbp)
		xchk_buffer_recheck(bs->sc, *pbp);

	xchk_btree_check_minrecs(bs, level, *pblock);

	/*
	 * Check the block's owner; this function absorbs error codes
	 * for us.
	 */
	error = xchk_btree_check_owner(bs, level, *pbp);
	if (error)
		return error;

	/*
	 * Check the block's siblings; this function absorbs error codes
	 * for us.
	 */
	error = xchk_btree_block_check_siblings(bs, *pblock);
	if (error)
		return error;

	xchk_btree_block_check_keys(bs, level, *pblock);
	return 0;
}
631 
/*
 * Check that the low and high keys of this block match the keys stored
 * in the parent block.
 *
 * Called when we finish walking a block; recomputes the block's keys from
 * its contents and compares them against the parent's keyptr copy.
 */
STATIC void
xchk_btree_block_keys(
	struct xchk_btree	*bs,
	int			level,
	struct xfs_btree_block	*block)
{
	union xfs_btree_key	block_keys;
	struct xfs_btree_cur	*cur = bs->cur;
	union xfs_btree_key	*high_bk;
	union xfs_btree_key	*parent_keys;
	union xfs_btree_key	*high_pk;
	struct xfs_btree_block	*parent_block;
	struct xfs_buf		*bp;

	/* The root block has no parent to compare against. */
	if (level >= cur->bc_nlevels - 1)
		return;

	/* Calculate the keys for this block. */
	xfs_btree_get_keys(cur, block, &block_keys);

	/* Obtain the parent's copy of the keys for this block. */
	parent_block = xfs_btree_get_block(cur, level + 1, &bp);
	parent_keys = xfs_btree_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	/*
	 * NOTE(review): corruption is flagged at hardcoded level 1 here and
	 * below rather than at @level + 1 — confirm this is intentional.
	 */
	if (xfs_btree_keycmp_ne(cur, &block_keys, parent_keys))
		xchk_btree_set_corrupt(bs->sc, cur, 1);

	/* Only overlapping btrees maintain high keys. */
	if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
		return;

	/* Get high keys */
	high_bk = xfs_btree_high_key_from_key(cur, &block_keys);
	high_pk = xfs_btree_high_key_addr(cur, cur->bc_levels[level + 1].ptr,
			parent_block);

	if (xfs_btree_keycmp_ne(cur, high_bk, high_pk))
		xchk_btree_set_corrupt(bs->sc, cur, 1);
}
675 
/*
 * Visit all nodes and leaves of a btree.  Check that all pointers and
 * records are in order, that the keys reflect the records, and use a callback
 * so that the caller can verify individual records.
 *
 * @scrub_fn is invoked once per leaf record; @oinfo describes the expected
 * rmap owner of every block; @private is passed through to the callback via
 * bs->private.  Returns 0 or a negative errno; corruption is reported via
 * sc->sm->sm_flags rather than the return value.
 */
int
xchk_btree(
	struct xfs_scrub		*sc,
	struct xfs_btree_cur		*cur,
	xchk_btree_rec_fn		scrub_fn,
	const struct xfs_owner_info	*oinfo,
	void				*private)
{
	union xfs_btree_ptr		ptr;
	struct xchk_btree		*bs;
	union xfs_btree_ptr		*pp;
	union xfs_btree_rec		*recp;
	struct xfs_btree_block		*block;
	struct xfs_buf			*bp;
	struct check_owner		*co;
	struct check_owner		*n;
	size_t				cur_sz;
	int				level;
	int				error = 0;

	/*
	 * Allocate the btree scrub context from the heap, because this
	 * structure can get rather large.  Don't let a caller feed us a
	 * totally absurd size.
	 */
	cur_sz = xchk_btree_sizeof(cur->bc_nlevels);
	if (cur_sz > PAGE_SIZE) {
		xchk_btree_set_corrupt(sc, cur, 0);
		return 0;
	}
	bs = kzalloc(cur_sz, XCHK_GFP_FLAGS);
	if (!bs)
		return -ENOMEM;
	bs->cur = cur;
	bs->scrub_rec = scrub_fn;
	bs->oinfo = oinfo;
	bs->private = private;
	bs->sc = sc;

	/* Initialize scrub state */
	INIT_LIST_HEAD(&bs->to_check);

	/*
	 * Load the root of the btree.  The helper function absorbs
	 * error codes for us.
	 */
	level = cur->bc_nlevels - 1;
	xfs_btree_init_ptr_from_cur(cur, &ptr);
	if (!xchk_btree_ptr_ok(bs, cur->bc_nlevels, &ptr))
		goto out;
	error = xchk_btree_get_block(bs, level, &ptr, &block, &bp);
	if (error || !block)
		goto out;

	/* Cursor block pointers are 1-based. */
	cur->bc_levels[level].ptr = 1;

	/* Iterative depth-first walk driven by the cursor's per-level ptrs. */
	while (level < cur->bc_nlevels) {
		block = xfs_btree_get_block(cur, level, &bp);

		if (level == 0) {
			/* End of leaf, pop back towards the root. */
			if (cur->bc_levels[level].ptr >
			    be16_to_cpu(block->bb_numrecs)) {
				xchk_btree_block_keys(bs, level, block);
				if (level < cur->bc_nlevels - 1)
					cur->bc_levels[level + 1].ptr++;
				level++;
				continue;
			}

			/* Records in order for scrub? */
			xchk_btree_rec(bs);

			/* Call out to the record checker. */
			recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
					block);
			error = bs->scrub_rec(bs, recp);
			if (error)
				break;
			/* Stop early on fatal signal or found corruption. */
			if (xchk_should_terminate(sc, &error) ||
			    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT))
				break;

			cur->bc_levels[level].ptr++;
			continue;
		}

		/* End of node, pop back towards the root. */
		if (cur->bc_levels[level].ptr >
					be16_to_cpu(block->bb_numrecs)) {
			xchk_btree_block_keys(bs, level, block);
			if (level < cur->bc_nlevels - 1)
				cur->bc_levels[level + 1].ptr++;
			level++;
			continue;
		}

		/* Keys in order for scrub? */
		xchk_btree_key(bs, level);

		/* Drill another level deeper. */
		pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
		if (!xchk_btree_ptr_ok(bs, level, pp)) {
			/* Bad pointer: skip this child, try the next. */
			cur->bc_levels[level].ptr++;
			continue;
		}
		level--;
		error = xchk_btree_get_block(bs, level, pp, &block, &bp);
		if (error || !block)
			goto out;

		cur->bc_levels[level].ptr = 1;
	}

out:
	/* Process deferred owner checks on btree blocks. */
	list_for_each_entry_safe(co, n, &bs->to_check, list) {
		/* Always drain the list even after an error. */
		if (!error && bs->cur)
			error = xchk_btree_check_block_owner(bs, co->level,
					co->daddr);
		list_del(&co->list);
		kfree(co);
	}
	kfree(bs);

	return error;
}
808