xref: /linux/fs/xfs/scrub/dir.c (revision e49a3eac9207e9575337f70feeb29430f6f16bb7)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2017-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_log_format.h"
13 #include "xfs_trans.h"
14 #include "xfs_inode.h"
15 #include "xfs_icache.h"
16 #include "xfs_dir2.h"
17 #include "xfs_dir2_priv.h"
18 #include "xfs_health.h"
19 #include "xfs_attr.h"
20 #include "xfs_parent.h"
21 #include "scrub/scrub.h"
22 #include "scrub/common.h"
23 #include "scrub/dabtree.h"
24 #include "scrub/readdir.h"
25 #include "scrub/health.h"
26 #include "scrub/repair.h"
27 #include "scrub/trace.h"
28 #include "scrub/xfile.h"
29 #include "scrub/xfarray.h"
30 #include "scrub/xfblob.h"
31 
/*
 * Prepare to scrub a directory.  When xchk_could_repair() says a repair may
 * follow, the repair code gets to do its own setup first; after that we set
 * up the inode contents for scrubbing.  Returns 0 or a negative errno.
 */
int
xchk_setup_directory(
	struct xfs_scrub	*sc)
{
	if (xchk_could_repair(sc)) {
		int		ret = xrep_setup_directory(sc);

		if (ret)
			return ret;
	}

	return xchk_setup_inode_contents(sc, 0);
}
47 
48 /* Directories */
49 
50 /* Deferred directory entry that we saved for later. */
51 struct xchk_dirent {
52 	/* Cookie for retrieval of the dirent name. */
53 	xfblob_cookie		name_cookie;
54 
55 	/* Child inode number. */
56 	xfs_ino_t		ino;
57 
58 	/* Length of the pptr name. */
59 	uint8_t			namelen;
60 };
61 
62 struct xchk_dir {
63 	struct xfs_scrub	*sc;
64 
65 	/* information for parent pointer validation. */
66 	struct xfs_parent_rec	pptr_rec;
67 	struct xfs_da_args	pptr_args;
68 
69 	/* Fixed-size array of xchk_dirent structures. */
70 	struct xfarray		*dir_entries;
71 
72 	/* Blobs containing dirent names. */
73 	struct xfblob		*dir_names;
74 
75 	/* If we've cycled the ILOCK, we must revalidate deferred dirents. */
76 	bool			need_revalidate;
77 
78 	/* Name buffer for dirent revalidation. */
79 	struct xfs_name		xname;
80 	uint8_t			namebuf[MAXNAMELEN];
81 };
82 
83 /* Scrub a directory entry. */
84 
85 /* Check that an inode's mode matches a given XFS_DIR3_FT_* type. */
86 STATIC void
87 xchk_dir_check_ftype(
88 	struct xfs_scrub	*sc,
89 	xfs_fileoff_t		offset,
90 	struct xfs_inode	*ip,
91 	int			ftype)
92 {
93 	struct xfs_mount	*mp = sc->mp;
94 
95 	if (!xfs_has_ftype(mp)) {
96 		if (ftype != XFS_DIR3_FT_UNKNOWN && ftype != XFS_DIR3_FT_DIR)
97 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
98 		return;
99 	}
100 
101 	if (xfs_mode_to_ftype(VFS_I(ip)->i_mode) != ftype)
102 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
103 
104 	/*
105 	 * Metadata and regular inodes cannot cross trees.  This property
106 	 * cannot change without a full inode free and realloc cycle, so it's
107 	 * safe to check this without holding locks.
108 	 */
109 	if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(sc->ip))
110 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
111 }
112 
113 /*
114  * Try to lock a child file for checking parent pointers.  Returns the inode
115  * flags for the locks we now hold, or zero if we failed.
116  */
117 STATIC unsigned int
118 xchk_dir_lock_child(
119 	struct xfs_scrub	*sc,
120 	struct xfs_inode	*ip)
121 {
122 	if (!xfs_ilock_nowait(ip, XFS_IOLOCK_SHARED))
123 		return 0;
124 
125 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_SHARED)) {
126 		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
127 		return 0;
128 	}
129 
130 	if (!xfs_inode_has_attr_fork(ip) || !xfs_need_iread_extents(&ip->i_af))
131 		return XFS_IOLOCK_SHARED | XFS_ILOCK_SHARED;
132 
133 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
134 
135 	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL)) {
136 		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
137 		return 0;
138 	}
139 
140 	return XFS_IOLOCK_SHARED | XFS_ILOCK_EXCL;
141 }
142 
143 /* Check the backwards link (parent pointer) associated with this dirent. */
144 STATIC int
145 xchk_dir_parent_pointer(
146 	struct xchk_dir		*sd,
147 	const struct xfs_name	*name,
148 	struct xfs_inode	*ip)
149 {
150 	struct xfs_scrub	*sc = sd->sc;
151 	int			error;
152 
153 	xfs_inode_to_parent_rec(&sd->pptr_rec, sc->ip);
154 	error = xfs_parent_lookup(sc->tp, ip, name, &sd->pptr_rec,
155 			&sd->pptr_args);
156 	if (error == -ENOATTR)
157 		xchk_fblock_xref_set_corrupt(sc, XFS_DATA_FORK, 0);
158 
159 	return 0;
160 }
161 
162 /* Look for a parent pointer matching this dirent, if the child isn't busy. */
163 STATIC int
164 xchk_dir_check_pptr_fast(
165 	struct xchk_dir		*sd,
166 	xfs_dir2_dataptr_t	dapos,
167 	const struct xfs_name	*name,
168 	struct xfs_inode	*ip)
169 {
170 	struct xfs_scrub	*sc = sd->sc;
171 	unsigned int		lockmode;
172 	int			error;
173 
174 	/* dot and dotdot entries do not have parent pointers */
175 	if (xfs_dir2_samename(name, &xfs_name_dot) ||
176 	    xfs_dir2_samename(name, &xfs_name_dotdot))
177 		return 0;
178 
179 	/* No self-referential non-dot or dotdot dirents. */
180 	if (ip == sc->ip) {
181 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
182 		return -ECANCELED;
183 	}
184 
185 	/* Try to lock the inode. */
186 	lockmode = xchk_dir_lock_child(sc, ip);
187 	if (!lockmode) {
188 		struct xchk_dirent	save_de = {
189 			.namelen	= name->len,
190 			.ino		= ip->i_ino,
191 		};
192 
193 		/* Couldn't lock the inode, so save the dirent for later. */
194 		trace_xchk_dir_defer(sc->ip, name, ip->i_ino);
195 
196 		error = xfblob_storename(sd->dir_names, &save_de.name_cookie,
197 				name);
198 		if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
199 					&error))
200 			return error;
201 
202 		error = xfarray_append(sd->dir_entries, &save_de);
203 		if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
204 					&error))
205 			return error;
206 
207 		return 0;
208 	}
209 
210 	error = xchk_dir_parent_pointer(sd, name, ip);
211 	xfs_iunlock(ip, lockmode);
212 	return error;
213 }
214 
215 /*
216  * Scrub a single directory entry.
217  *
218  * Check the inode number to make sure it's sane, then we check that we can
219  * look up this filename.  Finally, we check the ftype.
220  */
221 STATIC int
222 xchk_dir_actor(
223 	struct xfs_scrub	*sc,
224 	struct xfs_inode	*dp,
225 	xfs_dir2_dataptr_t	dapos,
226 	const struct xfs_name	*name,
227 	xfs_ino_t		ino,
228 	void			*priv)
229 {
230 	struct xfs_mount	*mp = dp->i_mount;
231 	struct xfs_inode	*ip;
232 	struct xchk_dir		*sd = priv;
233 	xfs_ino_t		lookup_ino;
234 	xfs_dablk_t		offset;
235 	int			error = 0;
236 
237 	offset = xfs_dir2_db_to_da(mp->m_dir_geo,
238 			xfs_dir2_dataptr_to_db(mp->m_dir_geo, dapos));
239 
240 	if (xchk_should_terminate(sc, &error))
241 		return error;
242 
243 	/* Does this inode number make sense? */
244 	if (!xfs_verify_dir_ino(mp, ino)) {
245 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
246 		return -ECANCELED;
247 	}
248 
249 	/* Does this name make sense? */
250 	if (!xfs_dir2_namecheck(name->name, name->len)) {
251 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
252 		return -ECANCELED;
253 	}
254 
255 	if (xfs_dir2_samename(name, &xfs_name_dot)) {
256 		/* If this is "." then check that the inum matches the dir. */
257 		if (ino != dp->i_ino)
258 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
259 	} else if (xfs_dir2_samename(name, &xfs_name_dotdot)) {
260 		/*
261 		 * If this is ".." in the root inode, check that the inum
262 		 * matches this dir.
263 		 */
264 		if (xchk_inode_is_dirtree_root(dp) && ino != dp->i_ino)
265 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
266 	}
267 
268 	/* Verify that we can look up this name by hash. */
269 	error = xchk_dir_lookup(sc, dp, name, &lookup_ino);
270 	/* ENOENT means the hash lookup failed and the dir is corrupt */
271 	if (error == -ENOENT)
272 		error = -EFSCORRUPTED;
273 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, offset, &error))
274 		goto out;
275 	if (lookup_ino != ino) {
276 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, offset);
277 		return -ECANCELED;
278 	}
279 
280 	/*
281 	 * Grab the inode pointed to by the dirent.  We release the inode
282 	 * before we cancel the scrub transaction.
283 	 *
284 	 * If _iget returns -EINVAL or -ENOENT then the child inode number is
285 	 * garbage and the directory is corrupt.  If the _iget returns
286 	 * -EFSCORRUPTED or -EFSBADCRC then the child is corrupt which is a
287 	 *  cross referencing error.  Any other error is an operational error.
288 	 */
289 	error = xchk_iget(sc, ino, &ip);
290 	if (error == -EINVAL || error == -ENOENT) {
291 		error = -EFSCORRUPTED;
292 		xchk_fblock_process_error(sc, XFS_DATA_FORK, 0, &error);
293 		goto out;
294 	}
295 	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, offset, &error))
296 		goto out;
297 
298 	xchk_dir_check_ftype(sc, offset, ip, name->type);
299 
300 	if (xfs_has_parent(mp)) {
301 		error = xchk_dir_check_pptr_fast(sd, dapos, name, ip);
302 		if (error)
303 			goto out_rele;
304 	}
305 
306 out_rele:
307 	xchk_irele(sc, ip);
308 out:
309 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
310 		return -ECANCELED;
311 	return error;
312 }
313 
314 /* Scrub a directory btree record. */
315 STATIC int
316 xchk_dir_rec(
317 	struct xchk_da_btree		*ds,
318 	int				level)
319 {
320 	struct xfs_name			dname = { };
321 	struct xfs_da_state_blk		*blk = &ds->state->path.blk[level];
322 	struct xfs_mount		*mp = ds->state->mp;
323 	struct xfs_inode		*dp = ds->dargs.dp;
324 	struct xfs_da_geometry		*geo = mp->m_dir_geo;
325 	struct xfs_dir2_data_entry	*dent;
326 	struct xfs_buf			*bp;
327 	struct xfs_dir2_leaf_entry	*ent;
328 	unsigned int			end;
329 	unsigned int			iter_off;
330 	xfs_ino_t			ino;
331 	xfs_dablk_t			rec_bno;
332 	xfs_dir2_db_t			db;
333 	xfs_dir2_data_aoff_t		off;
334 	xfs_dir2_dataptr_t		ptr;
335 	xfs_dahash_t			calc_hash;
336 	xfs_dahash_t			hash;
337 	struct xfs_dir3_icleaf_hdr	hdr;
338 	unsigned int			tag;
339 	int				error;
340 
341 	ASSERT(blk->magic == XFS_DIR2_LEAF1_MAGIC ||
342 	       blk->magic == XFS_DIR2_LEAFN_MAGIC);
343 
344 	xfs_dir2_leaf_hdr_from_disk(mp, &hdr, blk->bp->b_addr);
345 	ent = hdr.ents + blk->index;
346 
347 	/* Check the hash of the entry. */
348 	error = xchk_da_btree_hash(ds, level, &ent->hashval);
349 	if (error)
350 		goto out;
351 
352 	/* Valid hash pointer? */
353 	ptr = be32_to_cpu(ent->address);
354 	if (ptr == 0)
355 		return 0;
356 
357 	/* Find the directory entry's location. */
358 	db = xfs_dir2_dataptr_to_db(geo, ptr);
359 	off = xfs_dir2_dataptr_to_off(geo, ptr);
360 	rec_bno = xfs_dir2_db_to_da(geo, db);
361 
362 	if (rec_bno >= geo->leafblk) {
363 		xchk_da_set_corrupt(ds, level);
364 		goto out;
365 	}
366 	error = xfs_dir3_data_read(ds->dargs.trans, dp, ds->dargs.owner,
367 			rec_bno, XFS_DABUF_MAP_HOLE_OK, &bp);
368 	if (!xchk_fblock_process_error(ds->sc, XFS_DATA_FORK, rec_bno,
369 			&error))
370 		goto out;
371 	if (!bp) {
372 		xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
373 		goto out;
374 	}
375 	xchk_buffer_recheck(ds->sc, bp);
376 
377 	if (ds->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
378 		goto out_relse;
379 
380 	dent = bp->b_addr + off;
381 
382 	/* Make sure we got a real directory entry. */
383 	iter_off = geo->data_entry_offset;
384 	end = xfs_dir3_data_end_offset(geo, bp->b_addr);
385 	if (!end) {
386 		xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
387 		goto out_relse;
388 	}
389 	for (;;) {
390 		struct xfs_dir2_data_entry	*dep = bp->b_addr + iter_off;
391 		struct xfs_dir2_data_unused	*dup = bp->b_addr + iter_off;
392 
393 		if (iter_off >= end) {
394 			xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
395 			goto out_relse;
396 		}
397 
398 		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
399 			iter_off += be16_to_cpu(dup->length);
400 			continue;
401 		}
402 		if (dep == dent)
403 			break;
404 		iter_off += xfs_dir2_data_entsize(mp, dep->namelen);
405 	}
406 
407 	/* Retrieve the entry, sanity check it, and compare hashes. */
408 	ino = be64_to_cpu(dent->inumber);
409 	hash = be32_to_cpu(ent->hashval);
410 	tag = be16_to_cpup(xfs_dir2_data_entry_tag_p(mp, dent));
411 	if (!xfs_verify_dir_ino(mp, ino) || tag != off)
412 		xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
413 	if (dent->namelen == 0) {
414 		xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
415 		goto out_relse;
416 	}
417 
418 	/* Does the directory hash match? */
419 	dname.name = dent->name;
420 	dname.len = dent->namelen;
421 	calc_hash = xfs_dir2_hashname(mp, &dname);
422 	if (calc_hash != hash)
423 		xchk_fblock_set_corrupt(ds->sc, XFS_DATA_FORK, rec_bno);
424 
425 out_relse:
426 	xfs_trans_brelse(ds->dargs.trans, bp);
427 out:
428 	return error;
429 }
430 
431 /*
432  * Is this unused entry either in the bestfree or smaller than all of
433  * them?  We've already checked that the bestfrees are sorted longest to
434  * shortest, and that there aren't any bogus entries.
435  */
436 STATIC void
437 xchk_directory_check_free_entry(
438 	struct xfs_scrub		*sc,
439 	xfs_dablk_t			lblk,
440 	struct xfs_dir2_data_free	*bf,
441 	struct xfs_dir2_data_unused	*dup)
442 {
443 	struct xfs_dir2_data_free	*dfp;
444 	unsigned int			dup_length;
445 
446 	dup_length = be16_to_cpu(dup->length);
447 
448 	/* Unused entry is shorter than any of the bestfrees */
449 	if (dup_length < be16_to_cpu(bf[XFS_DIR2_DATA_FD_COUNT - 1].length))
450 		return;
451 
452 	for (dfp = &bf[XFS_DIR2_DATA_FD_COUNT - 1]; dfp >= bf; dfp--)
453 		if (dup_length == be16_to_cpu(dfp->length))
454 			return;
455 
456 	/* Unused entry should be in the bestfrees but wasn't found. */
457 	xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
458 }
459 
460 /* Check free space info in a directory data block. */
461 STATIC int
462 xchk_directory_data_bestfree(
463 	struct xfs_scrub		*sc,
464 	xfs_dablk_t			lblk,
465 	bool				is_block)
466 {
467 	struct xfs_dir2_data_unused	*dup;
468 	struct xfs_dir2_data_free	*dfp;
469 	struct xfs_buf			*bp;
470 	struct xfs_dir2_data_free	*bf;
471 	struct xfs_mount		*mp = sc->mp;
472 	u16				tag;
473 	unsigned int			nr_bestfrees = 0;
474 	unsigned int			nr_frees = 0;
475 	unsigned int			smallest_bestfree;
476 	int				newlen;
477 	unsigned int			offset;
478 	unsigned int			end;
479 	int				error;
480 
481 	if (is_block) {
482 		/* dir block format */
483 		if (lblk != XFS_B_TO_FSBT(mp, XFS_DIR2_DATA_OFFSET))
484 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
485 		error = xfs_dir3_block_read(sc->tp, sc->ip, sc->ip->i_ino, &bp);
486 	} else {
487 		/* dir data format */
488 		error = xfs_dir3_data_read(sc->tp, sc->ip, sc->ip->i_ino, lblk,
489 				0, &bp);
490 	}
491 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
492 		goto out;
493 	xchk_buffer_recheck(sc, bp);
494 
495 	/* XXX: Check xfs_dir3_data_hdr.pad is zero once we start setting it. */
496 
497 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
498 		goto out_buf;
499 
500 	/* Do the bestfrees correspond to actual free space? */
501 	bf = xfs_dir2_data_bestfree_p(mp, bp->b_addr);
502 	smallest_bestfree = UINT_MAX;
503 	for (dfp = &bf[0]; dfp < &bf[XFS_DIR2_DATA_FD_COUNT]; dfp++) {
504 		offset = be16_to_cpu(dfp->offset);
505 		if (offset == 0)
506 			continue;
507 		if (offset >= mp->m_dir_geo->blksize) {
508 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
509 			goto out_buf;
510 		}
511 		dup = bp->b_addr + offset;
512 		tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
513 
514 		/* bestfree doesn't match the entry it points at? */
515 		if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG) ||
516 		    be16_to_cpu(dup->length) != be16_to_cpu(dfp->length) ||
517 		    tag != offset) {
518 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
519 			goto out_buf;
520 		}
521 
522 		/* bestfree records should be ordered largest to smallest */
523 		if (smallest_bestfree < be16_to_cpu(dfp->length)) {
524 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
525 			goto out_buf;
526 		}
527 
528 		smallest_bestfree = be16_to_cpu(dfp->length);
529 		nr_bestfrees++;
530 	}
531 
532 	/* Make sure the bestfrees are actually the best free spaces. */
533 	offset = mp->m_dir_geo->data_entry_offset;
534 	end = xfs_dir3_data_end_offset(mp->m_dir_geo, bp->b_addr);
535 
536 	/* Iterate the entries, stopping when we hit or go past the end. */
537 	while (offset < end) {
538 		dup = bp->b_addr + offset;
539 
540 		/* Skip real entries */
541 		if (dup->freetag != cpu_to_be16(XFS_DIR2_DATA_FREE_TAG)) {
542 			struct xfs_dir2_data_entry *dep = bp->b_addr + offset;
543 
544 			newlen = xfs_dir2_data_entsize(mp, dep->namelen);
545 			if (newlen <= 0) {
546 				xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
547 						lblk);
548 				goto out_buf;
549 			}
550 			offset += newlen;
551 			continue;
552 		}
553 
554 		/* Spot check this free entry */
555 		tag = be16_to_cpu(*xfs_dir2_data_unused_tag_p(dup));
556 		if (tag != offset) {
557 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
558 			goto out_buf;
559 		}
560 
561 		/*
562 		 * Either this entry is a bestfree or it's smaller than
563 		 * any of the bestfrees.
564 		 */
565 		xchk_directory_check_free_entry(sc, lblk, bf, dup);
566 		if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
567 			goto out_buf;
568 
569 		/* Move on. */
570 		newlen = be16_to_cpu(dup->length);
571 		if (newlen <= 0) {
572 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
573 			goto out_buf;
574 		}
575 		offset += newlen;
576 		if (offset <= end)
577 			nr_frees++;
578 	}
579 
580 	/* We're required to fill all the space. */
581 	if (offset != end)
582 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
583 
584 	/* Did we see at least as many free slots as there are bestfrees? */
585 	if (nr_frees < nr_bestfrees)
586 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
587 out_buf:
588 	xfs_trans_brelse(sc->tp, bp);
589 out:
590 	return error;
591 }
592 
593 /*
594  * Does the free space length in the free space index block ($len) match
595  * the longest length in the directory data block's bestfree array?
596  * Assume that we've already checked that the data block's bestfree
597  * array is in order.
598  */
599 STATIC void
600 xchk_directory_check_freesp(
601 	struct xfs_scrub		*sc,
602 	xfs_dablk_t			lblk,
603 	struct xfs_buf			*dbp,
604 	unsigned int			len)
605 {
606 	struct xfs_dir2_data_free	*dfp;
607 
608 	dfp = xfs_dir2_data_bestfree_p(sc->mp, dbp->b_addr);
609 
610 	if (len != be16_to_cpu(dfp->length))
611 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
612 
613 	if (len > 0 && be16_to_cpu(dfp->offset) == 0)
614 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
615 }
616 
617 /* Check free space info in a directory leaf1 block. */
618 STATIC int
619 xchk_directory_leaf1_bestfree(
620 	struct xfs_scrub		*sc,
621 	struct xfs_da_args		*args,
622 	xfs_dir2_db_t			last_data_db,
623 	xfs_dablk_t			lblk)
624 {
625 	struct xfs_dir3_icleaf_hdr	leafhdr;
626 	struct xfs_dir2_leaf_tail	*ltp;
627 	struct xfs_dir2_leaf		*leaf;
628 	struct xfs_buf			*dbp;
629 	struct xfs_buf			*bp;
630 	struct xfs_da_geometry		*geo = sc->mp->m_dir_geo;
631 	__be16				*bestp;
632 	__u16				best;
633 	__u32				hash;
634 	__u32				lasthash = 0;
635 	__u32				bestcount;
636 	unsigned int			stale = 0;
637 	int				i;
638 	int				error;
639 
640 	/* Read the free space block. */
641 	error = xfs_dir3_leaf_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
642 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
643 		return error;
644 	xchk_buffer_recheck(sc, bp);
645 
646 	leaf = bp->b_addr;
647 	xfs_dir2_leaf_hdr_from_disk(sc->ip->i_mount, &leafhdr, leaf);
648 	ltp = xfs_dir2_leaf_tail_p(geo, leaf);
649 	bestcount = be32_to_cpu(ltp->bestcount);
650 	bestp = xfs_dir2_leaf_bests_p(ltp);
651 
652 	if (xfs_has_crc(sc->mp)) {
653 		struct xfs_dir3_leaf_hdr	*hdr3 = bp->b_addr;
654 
655 		if (hdr3->pad != cpu_to_be32(0))
656 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
657 	}
658 
659 	/*
660 	 * There must be enough bestfree slots to cover all the directory data
661 	 * blocks that we scanned.  It is possible for there to be a hole
662 	 * between the last data block and i_disk_size.  This seems like an
663 	 * oversight to the scrub author, but as we have been writing out
664 	 * directories like this (and xfs_repair doesn't mind them) for years,
665 	 * that's what we have to check.
666 	 */
667 	if (bestcount != last_data_db + 1) {
668 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
669 		goto out;
670 	}
671 
672 	/* Is the leaf count even remotely sane? */
673 	if (leafhdr.count > geo->leaf_max_ents) {
674 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
675 		goto out;
676 	}
677 
678 	/* Leaves and bests don't overlap in leaf format. */
679 	if ((char *)&leafhdr.ents[leafhdr.count] > (char *)bestp) {
680 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
681 		goto out;
682 	}
683 
684 	/* Check hash value order, count stale entries.  */
685 	for (i = 0; i < leafhdr.count; i++) {
686 		hash = be32_to_cpu(leafhdr.ents[i].hashval);
687 		if (i > 0 && lasthash > hash)
688 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
689 		lasthash = hash;
690 		if (leafhdr.ents[i].address ==
691 		    cpu_to_be32(XFS_DIR2_NULL_DATAPTR))
692 			stale++;
693 	}
694 	if (leafhdr.stale != stale)
695 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
696 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
697 		goto out;
698 
699 	/* Check all the bestfree entries. */
700 	for (i = 0; i < bestcount; i++, bestp++) {
701 		best = be16_to_cpu(*bestp);
702 		error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
703 				xfs_dir2_db_to_da(args->geo, i),
704 				XFS_DABUF_MAP_HOLE_OK, &dbp);
705 		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
706 				&error))
707 			break;
708 
709 		if (!dbp) {
710 			if (best != NULLDATAOFF) {
711 				xchk_fblock_set_corrupt(sc, XFS_DATA_FORK,
712 						lblk);
713 				break;
714 			}
715 			continue;
716 		}
717 
718 		if (best == NULLDATAOFF)
719 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
720 		else
721 			xchk_directory_check_freesp(sc, lblk, dbp, best);
722 		xfs_trans_brelse(sc->tp, dbp);
723 		if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
724 			break;
725 	}
726 out:
727 	xfs_trans_brelse(sc->tp, bp);
728 	return error;
729 }
730 
731 /* Check free space info in a directory freespace block. */
732 STATIC int
733 xchk_directory_free_bestfree(
734 	struct xfs_scrub		*sc,
735 	struct xfs_da_args		*args,
736 	xfs_dablk_t			lblk)
737 {
738 	struct xfs_dir3_icfree_hdr	freehdr;
739 	struct xfs_buf			*dbp;
740 	struct xfs_buf			*bp;
741 	__u16				best;
742 	unsigned int			stale = 0;
743 	int				i;
744 	int				error;
745 
746 	/* Read the free space block */
747 	error = xfs_dir2_free_read(sc->tp, sc->ip, sc->ip->i_ino, lblk, &bp);
748 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
749 		return error;
750 	xchk_buffer_recheck(sc, bp);
751 
752 	if (xfs_has_crc(sc->mp)) {
753 		struct xfs_dir3_free_hdr	*hdr3 = bp->b_addr;
754 
755 		if (hdr3->pad != cpu_to_be32(0))
756 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
757 	}
758 
759 	/* Check all the entries. */
760 	xfs_dir2_free_hdr_from_disk(sc->ip->i_mount, &freehdr, bp->b_addr);
761 	for (i = 0; i < freehdr.nvalid; i++) {
762 		best = be16_to_cpu(freehdr.bests[i]);
763 		if (best == NULLDATAOFF) {
764 			stale++;
765 			continue;
766 		}
767 		error = xfs_dir3_data_read(sc->tp, sc->ip, args->owner,
768 				(freehdr.firstdb + i) * args->geo->fsbcount,
769 				0, &dbp);
770 		if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk,
771 				&error))
772 			goto out;
773 		xchk_directory_check_freesp(sc, lblk, dbp, best);
774 		xfs_trans_brelse(sc->tp, dbp);
775 	}
776 
777 	if (freehdr.nused + stale != freehdr.nvalid)
778 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
779 out:
780 	xfs_trans_brelse(sc->tp, bp);
781 	return error;
782 }
783 
784 /* Check free space information in directories. */
785 STATIC int
786 xchk_directory_blocks(
787 	struct xfs_scrub	*sc)
788 {
789 	struct xfs_bmbt_irec	got;
790 	struct xfs_da_args	args = {
791 		.dp		= sc->ip,
792 		.whichfork	= XFS_DATA_FORK,
793 		.geo		= sc->mp->m_dir_geo,
794 		.trans		= sc->tp,
795 		.owner		= sc->ip->i_ino,
796 	};
797 	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
798 	struct xfs_mount	*mp = sc->mp;
799 	xfs_fileoff_t		leaf_lblk;
800 	xfs_fileoff_t		free_lblk;
801 	xfs_fileoff_t		lblk;
802 	struct xfs_iext_cursor	icur;
803 	xfs_dablk_t		dabno;
804 	xfs_dir2_db_t		last_data_db = 0;
805 	bool			found;
806 	bool			is_block = false;
807 	int			error;
808 
809 	/* Ignore local format directories. */
810 	if (ifp->if_format != XFS_DINODE_FMT_EXTENTS &&
811 	    ifp->if_format != XFS_DINODE_FMT_BTREE)
812 		return 0;
813 
814 	lblk = XFS_B_TO_FSB(mp, XFS_DIR2_DATA_OFFSET);
815 	leaf_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_LEAF_OFFSET);
816 	free_lblk = XFS_B_TO_FSB(mp, XFS_DIR2_FREE_OFFSET);
817 
818 	/* Is this a block dir? */
819 	if (xfs_dir2_format(&args, &error) == XFS_DIR2_FMT_BLOCK)
820 		is_block = true;
821 	if (!xchk_fblock_process_error(sc, XFS_DATA_FORK, lblk, &error))
822 		goto out;
823 
824 	/* Iterate all the data extents in the directory... */
825 	found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
826 	while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
827 		/* No more data blocks... */
828 		if (got.br_startoff >= leaf_lblk)
829 			break;
830 
831 		/*
832 		 * Check each data block's bestfree data.
833 		 *
834 		 * Iterate all the fsbcount-aligned block offsets in
835 		 * this directory.  The directory block reading code is
836 		 * smart enough to do its own bmap lookups to handle
837 		 * discontiguous directory blocks.  When we're done
838 		 * with the extent record, re-query the bmap at the
839 		 * next fsbcount-aligned offset to avoid redundant
840 		 * block checks.
841 		 */
842 		for (lblk = roundup((xfs_dablk_t)got.br_startoff,
843 				args.geo->fsbcount);
844 		     lblk < got.br_startoff + got.br_blockcount;
845 		     lblk += args.geo->fsbcount) {
846 			last_data_db = xfs_dir2_da_to_db(args.geo, lblk);
847 			error = xchk_directory_data_bestfree(sc, lblk,
848 					is_block);
849 			if (error)
850 				goto out;
851 		}
852 		dabno = got.br_startoff + got.br_blockcount;
853 		lblk = roundup(dabno, args.geo->fsbcount);
854 		found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
855 	}
856 
857 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
858 		goto out;
859 
860 	/* Look for a leaf1 block, which has free info. */
861 	if (xfs_iext_lookup_extent(sc->ip, ifp, leaf_lblk, &icur, &got) &&
862 	    got.br_startoff == leaf_lblk &&
863 	    got.br_blockcount == args.geo->fsbcount &&
864 	    !xfs_iext_next_extent(ifp, &icur, &got)) {
865 		if (is_block) {
866 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
867 			goto out;
868 		}
869 		error = xchk_directory_leaf1_bestfree(sc, &args, last_data_db,
870 				leaf_lblk);
871 		if (error)
872 			goto out;
873 	}
874 
875 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
876 		goto out;
877 
878 	/* Scan for free blocks */
879 	lblk = free_lblk;
880 	found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
881 	while (found && !(sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)) {
882 		/*
883 		 * Dirs can't have blocks mapped above 2^32.
884 		 * Single-block dirs shouldn't even be here.
885 		 */
886 		lblk = got.br_startoff;
887 		if (lblk & ~0xFFFFFFFFULL) {
888 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
889 			goto out;
890 		}
891 		if (is_block) {
892 			xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, lblk);
893 			goto out;
894 		}
895 
896 		/*
897 		 * Check each dir free block's bestfree data.
898 		 *
899 		 * Iterate all the fsbcount-aligned block offsets in
900 		 * this directory.  The directory block reading code is
901 		 * smart enough to do its own bmap lookups to handle
902 		 * discontiguous directory blocks.  When we're done
903 		 * with the extent record, re-query the bmap at the
904 		 * next fsbcount-aligned offset to avoid redundant
905 		 * block checks.
906 		 */
907 		for (lblk = roundup((xfs_dablk_t)got.br_startoff,
908 				args.geo->fsbcount);
909 		     lblk < got.br_startoff + got.br_blockcount;
910 		     lblk += args.geo->fsbcount) {
911 			error = xchk_directory_free_bestfree(sc, &args,
912 					lblk);
913 			if (error)
914 				goto out;
915 		}
916 		dabno = got.br_startoff + got.br_blockcount;
917 		lblk = roundup(dabno, args.geo->fsbcount);
918 		found = xfs_iext_lookup_extent(sc->ip, ifp, lblk, &icur, &got);
919 	}
920 out:
921 	return error;
922 }
923 
924 /*
925  * Revalidate a dirent that we collected in the past but couldn't check because
926  * of lock contention.  Returns 0 if the dirent is still valid, -ENOENT if it
927  * has gone away on us, or a negative errno.
928  */
929 STATIC int
930 xchk_dir_revalidate_dirent(
931 	struct xchk_dir		*sd,
932 	const struct xfs_name	*xname,
933 	xfs_ino_t		ino)
934 {
935 	struct xfs_scrub	*sc = sd->sc;
936 	xfs_ino_t		child_ino;
937 	int			error;
938 
939 	/*
940 	 * Look up the directory entry.  If we get -ENOENT, the directory entry
941 	 * went away and there's nothing to revalidate.  Return any other
942 	 * error.
943 	 */
944 	error = xchk_dir_lookup(sc, sc->ip, xname, &child_ino);
945 	if (error)
946 		return error;
947 
948 	/* The inode number changed, nothing to revalidate. */
949 	if (ino != child_ino)
950 		return -ENOENT;
951 
952 	return 0;
953 }
954 
955 /*
956  * Check a directory entry's parent pointers the slow way, which means we cycle
957  * locks a bunch and put up with revalidation until we get it done.
958  */
959 STATIC int
960 xchk_dir_slow_dirent(
961 	struct xchk_dir		*sd,
962 	struct xchk_dirent	*dirent,
963 	const struct xfs_name	*xname)
964 {
965 	struct xfs_scrub	*sc = sd->sc;
966 	struct xfs_inode	*ip;
967 	unsigned int		lockmode;
968 	int			error;
969 
970 	/* Check that the deferred dirent still exists. */
971 	if (sd->need_revalidate) {
972 		error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
973 		if (error == -ENOENT)
974 			return 0;
975 		if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0,
976 					&error))
977 			return error;
978 	}
979 
980 	error = xchk_iget(sc, dirent->ino, &ip);
981 	if (error == -EINVAL || error == -ENOENT) {
982 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
983 		return 0;
984 	}
985 	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
986 		return error;
987 
988 	/*
989 	 * If we can grab both IOLOCK and ILOCK of the alleged child, we can
990 	 * proceed with the validation.
991 	 */
992 	lockmode = xchk_dir_lock_child(sc, ip);
993 	if (lockmode) {
994 		trace_xchk_dir_slowpath(sc->ip, xname, ip->i_ino);
995 		goto check_pptr;
996 	}
997 
998 	/*
999 	 * We couldn't lock the child file.  Drop all the locks and try to
1000 	 * get them again, one at a time.
1001 	 */
1002 	xchk_iunlock(sc, sc->ilock_flags);
1003 	sd->need_revalidate = true;
1004 
1005 	trace_xchk_dir_ultraslowpath(sc->ip, xname, ip->i_ino);
1006 
1007 	error = xchk_dir_trylock_for_pptrs(sc, ip, &lockmode);
1008 	if (error)
1009 		goto out_rele;
1010 
1011 	/* Revalidate, since we just cycled the locks. */
1012 	error = xchk_dir_revalidate_dirent(sd, xname, dirent->ino);
1013 	if (error == -ENOENT) {
1014 		error = 0;
1015 		goto out_unlock;
1016 	}
1017 	if (!xchk_fblock_xref_process_error(sc, XFS_DATA_FORK, 0, &error))
1018 		goto out_unlock;
1019 
1020 check_pptr:
1021 	error = xchk_dir_parent_pointer(sd, xname, ip);
1022 out_unlock:
1023 	xfs_iunlock(ip, lockmode);
1024 out_rele:
1025 	xchk_irele(sc, ip);
1026 	return error;
1027 }
1028 
1029 /* Check all the dirents that we deferred the first time around. */
1030 STATIC int
1031 xchk_dir_finish_slow_dirents(
1032 	struct xchk_dir		*sd)
1033 {
1034 	xfarray_idx_t		array_cur;
1035 	int			error;
1036 
1037 	foreach_xfarray_idx(sd->dir_entries, array_cur) {
1038 		struct xchk_dirent	dirent;
1039 
1040 		if (sd->sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1041 			return 0;
1042 
1043 		error = xfarray_load(sd->dir_entries, array_cur, &dirent);
1044 		if (error)
1045 			return error;
1046 
1047 		error = xfblob_loadname(sd->dir_names, dirent.name_cookie,
1048 				&sd->xname, dirent.namelen);
1049 		if (error)
1050 			return error;
1051 
1052 		error = xchk_dir_slow_dirent(sd, &dirent, &sd->xname);
1053 		if (error)
1054 			return error;
1055 	}
1056 
1057 	return 0;
1058 }
1059 
1060 /* Scrub a whole directory. */
1061 int
1062 xchk_directory(
1063 	struct xfs_scrub	*sc)
1064 {
1065 	struct xchk_dir		*sd;
1066 	int			error;
1067 
1068 	if (!S_ISDIR(VFS_I(sc->ip)->i_mode))
1069 		return -ENOENT;
1070 
1071 	if (xchk_file_looks_zapped(sc, XFS_SICK_INO_DIR_ZAPPED)) {
1072 		xchk_fblock_set_corrupt(sc, XFS_DATA_FORK, 0);
1073 		return 0;
1074 	}
1075 
1076 	/* Plausible size? */
1077 	if (sc->ip->i_disk_size < xfs_dir2_sf_hdr_size(0)) {
1078 		xchk_ino_set_corrupt(sc, sc->ip->i_ino);
1079 		return 0;
1080 	}
1081 
1082 	/* Check directory tree structure */
1083 	error = xchk_da_btree(sc, XFS_DATA_FORK, xchk_dir_rec, NULL);
1084 	if (error)
1085 		return error;
1086 
1087 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1088 		return 0;
1089 
1090 	/* Check the freespace. */
1091 	error = xchk_directory_blocks(sc);
1092 	if (error)
1093 		return error;
1094 
1095 	if (sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT)
1096 		return 0;
1097 
1098 	sd = kvzalloc(sizeof(struct xchk_dir), XCHK_GFP_FLAGS);
1099 	if (!sd)
1100 		return -ENOMEM;
1101 	sd->sc = sc;
1102 	sd->xname.name = sd->namebuf;
1103 
1104 	if (xfs_has_parent(sc->mp)) {
1105 		char		*descr;
1106 
1107 		/*
1108 		 * Set up some staging memory for dirents that we can't check
1109 		 * due to locking contention.
1110 		 */
1111 		descr = xchk_xfile_ino_descr(sc, "slow directory entries");
1112 		error = xfarray_create(descr, 0, sizeof(struct xchk_dirent),
1113 				&sd->dir_entries);
1114 		kfree(descr);
1115 		if (error)
1116 			goto out_sd;
1117 
1118 		descr = xchk_xfile_ino_descr(sc, "slow directory entry names");
1119 		error = xfblob_create(descr, &sd->dir_names);
1120 		kfree(descr);
1121 		if (error)
1122 			goto out_entries;
1123 	}
1124 
1125 	/* Look up every name in this directory by hash. */
1126 	error = xchk_dir_walk(sc, sc->ip, xchk_dir_actor, sd);
1127 	if (error == -ECANCELED)
1128 		error = 0;
1129 	if (error)
1130 		goto out_names;
1131 
1132 	if (xfs_has_parent(sc->mp)) {
1133 		error = xchk_dir_finish_slow_dirents(sd);
1134 		if (error == -ETIMEDOUT) {
1135 			/* Couldn't grab a lock, scrub was marked incomplete */
1136 			error = 0;
1137 			goto out_names;
1138 		}
1139 		if (error)
1140 			goto out_names;
1141 	}
1142 
1143 out_names:
1144 	if (sd->dir_names)
1145 		xfblob_destroy(sd->dir_names);
1146 out_entries:
1147 	if (sd->dir_entries)
1148 		xfarray_destroy(sd->dir_entries);
1149 out_sd:
1150 	kvfree(sd);
1151 	if (error)
1152 		return error;
1153 
1154 	/* If the dir is clean, it is clearly not zapped. */
1155 	xchk_mark_healthy_if_clean(sc, XFS_SICK_INO_DIR_ZAPPED);
1156 	return 0;
1157 }
1158 
1159 /*
1160  * Decide if this directory has been zapped to satisfy the inode and ifork
1161  * verifiers.  Checking and repairing should be postponed until the directory
1162  * is fixed.
1163  */
1164 bool
1165 xchk_dir_looks_zapped(
1166 	struct xfs_inode	*dp)
1167 {
1168 	/* Repair zapped this dir's data fork a short time ago */
1169 	if (xfs_ifork_zapped(dp, XFS_DATA_FORK))
1170 		return true;
1171 
1172 	/*
1173 	 * If the dinode repair found a bad data fork, it will reset the fork
1174 	 * to extents format with zero records and wait for the bmapbtd
1175 	 * scrubber to reconstruct the block mappings.  Directories always
1176 	 * contain some content, so this is a clear sign of a zapped directory.
1177 	 * The state checked by xfs_ifork_zapped is not persisted, so this is
1178 	 * the secondary strategy if repairs are interrupted by a crash or an
1179 	 * unmount.
1180 	 */
1181 	return dp->i_df.if_format == XFS_DINODE_FMT_EXTENTS &&
1182 	       dp->i_df.if_nextents == 0;
1183 }
1184