xref: /linux/fs/xfs/scrub/dir_repair.c (revision 1e58a8ccf2597c9259a8e71a2bffac5e11e12ea0)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_bit.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_icache.h"
19 #include "xfs_da_format.h"
20 #include "xfs_da_btree.h"
21 #include "xfs_dir2.h"
22 #include "xfs_dir2_priv.h"
23 #include "xfs_bmap.h"
24 #include "xfs_quota.h"
25 #include "xfs_bmap_btree.h"
26 #include "xfs_trans_space.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_exchmaps.h"
29 #include "xfs_exchrange.h"
30 #include "xfs_ag.h"
31 #include "scrub/xfs_scrub.h"
32 #include "scrub/scrub.h"
33 #include "scrub/common.h"
34 #include "scrub/trace.h"
35 #include "scrub/repair.h"
36 #include "scrub/tempfile.h"
37 #include "scrub/tempexch.h"
38 #include "scrub/xfile.h"
39 #include "scrub/xfarray.h"
40 #include "scrub/xfblob.h"
41 #include "scrub/iscan.h"
42 #include "scrub/readdir.h"
43 #include "scrub/reap.h"
44 #include "scrub/findparent.h"
45 #include "scrub/orphanage.h"
46 
47 /*
48  * Directory Repair
49  * ================
50  *
51  * We repair directories by reading the directory data blocks looking for
52  * directory entries that look salvageable (name passes verifiers, entry points
53  * to a valid allocated inode, etc).  Each entry worth salvaging is stashed in
54  * memory, and the stashed entries are periodically replayed into a temporary
55  * directory to constrain memory use.  Batching the construction of the
56  * temporary directory in this fashion reduces lock cycling of the directory
57  * being repaired and the temporary directory, and will later become important
58  * for parent pointer scanning.
59  *
60  * Directory entries added to the temporary directory do not elevate the link
61  * counts of the inodes found.  When salvaging completes, the remaining stashed
62  * entries are replayed to the temporary directory.  An atomic mapping exchange
63  * is used to commit the new directory blocks to the directory being repaired.
64  * This will disrupt readdir cursors.
65  *
66  * Locking Issues
67  * --------------
68  *
69  * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
70  * /a/b for a "mv /a/b /c/" operation.  This means that only b's ILOCK protects
71  * b's dotdot update.  This is in contrast to every other dotdot update (link,
72  * remove, mkdir).  If the repair code drops the ILOCK, it must either
73  * revalidate the dotdot entry or use dirent hooks to capture updates from
74  * other threads.
75  */
76 
77 /* Directory entry to be restored in the new directory. */
78 struct xrep_dirent {
79 	/* Cookie for retrieval of the dirent name. */
80 	xfblob_cookie		name_cookie;
81 
82 	/* Target inode number. */
83 	xfs_ino_t		ino;
84 
85 	/* Length of the dirent name. */
86 	uint8_t			namelen;
87 
88 	/* File type of the dirent. */
89 	uint8_t			ftype;
90 };
91 
/*
 * Upper bound on the recovered dirent data (dir_entries plus dir_names) that
 * may accumulate in memory before it is written to the temp dir: eight pages.
 */
#define XREP_DIR_MAX_STASH_BYTES	(8 * PAGE_SIZE)
97 
98 struct xrep_dir {
99 	struct xfs_scrub	*sc;
100 
101 	/* Fixed-size array of xrep_dirent structures. */
102 	struct xfarray		*dir_entries;
103 
104 	/* Blobs containing directory entry names. */
105 	struct xfblob		*dir_names;
106 
107 	/* Information for exchanging data forks at the end. */
108 	struct xrep_tempexch	tx;
109 
110 	/* Preallocated args struct for performing dir operations */
111 	struct xfs_da_args	args;
112 
113 	/*
114 	 * Information used to scan the filesystem to find the inumber of the
115 	 * dotdot entry for this directory.
116 	 */
117 	struct xrep_parent_scan_info pscan;
118 
119 	/*
120 	 * Context information for attaching this directory to the lost+found
121 	 * if this directory does not have a parent.
122 	 */
123 	struct xrep_adoption	adoption;
124 
125 	/* How many subdirectories did we find? */
126 	uint64_t		subdirs;
127 
128 	/* How many dirents did we find? */
129 	unsigned int		dirents;
130 
131 	/* Should we move this directory to the orphanage? */
132 	bool			needs_adoption;
133 
134 	/* Directory entry name, plus the trailing null. */
135 	struct xfs_name		xname;
136 	unsigned char		namebuf[MAXNAMELEN];
137 };
138 
139 /* Tear down all the incore stuff we created. */
140 static void
141 xrep_dir_teardown(
142 	struct xfs_scrub	*sc)
143 {
144 	struct xrep_dir		*rd = sc->buf;
145 
146 	xrep_findparent_scan_teardown(&rd->pscan);
147 	xfblob_destroy(rd->dir_names);
148 	xfarray_destroy(rd->dir_entries);
149 }
150 
151 /* Set up for a directory repair. */
152 int
153 xrep_setup_directory(
154 	struct xfs_scrub	*sc)
155 {
156 	struct xrep_dir		*rd;
157 	int			error;
158 
159 	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
160 
161 	error = xrep_orphanage_try_create(sc);
162 	if (error)
163 		return error;
164 
165 	error = xrep_tempfile_create(sc, S_IFDIR);
166 	if (error)
167 		return error;
168 
169 	rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
170 	if (!rd)
171 		return -ENOMEM;
172 	rd->sc = sc;
173 	rd->xname.name = rd->namebuf;
174 	sc->buf = rd;
175 
176 	return 0;
177 }
178 
179 /*
180  * If we're the root of a directory tree, we are our own parent.  If we're an
181  * unlinked directory, the parent /won't/ have a link to us.  Set the parent
182  * directory to the root for both cases.  Returns NULLFSINO if we don't know
183  * what to do.
184  */
185 static inline xfs_ino_t
186 xrep_dir_self_parent(
187 	struct xrep_dir		*rd)
188 {
189 	struct xfs_scrub	*sc = rd->sc;
190 
191 	if (sc->ip->i_ino == sc->mp->m_sb.sb_rootino)
192 		return sc->mp->m_sb.sb_rootino;
193 
194 	if (VFS_I(sc->ip)->i_nlink == 0)
195 		return sc->mp->m_sb.sb_rootino;
196 
197 	return NULLFSINO;
198 }
199 
200 /*
201  * Look up the dotdot entry and confirm that it's really the parent.
202  * Returns NULLFSINO if we don't know what to do.
203  */
204 static inline xfs_ino_t
205 xrep_dir_lookup_parent(
206 	struct xrep_dir		*rd)
207 {
208 	struct xfs_scrub	*sc = rd->sc;
209 	xfs_ino_t		ino;
210 	int			error;
211 
212 	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
213 	if (error)
214 		return NULLFSINO;
215 	if (!xfs_verify_dir_ino(sc->mp, ino))
216 		return NULLFSINO;
217 
218 	error = xrep_findparent_confirm(sc, &ino);
219 	if (error)
220 		return NULLFSINO;
221 
222 	return ino;
223 }
224 
225 /*
226  * Look up '..' in the dentry cache and confirm that it's really the parent.
227  * Returns NULLFSINO if the dcache misses or if the hit is implausible.
228  */
229 static inline xfs_ino_t
230 xrep_dir_dcache_parent(
231 	struct xrep_dir		*rd)
232 {
233 	struct xfs_scrub	*sc = rd->sc;
234 	xfs_ino_t		parent_ino;
235 	int			error;
236 
237 	parent_ino = xrep_findparent_from_dcache(sc);
238 	if (parent_ino == NULLFSINO)
239 		return parent_ino;
240 
241 	error = xrep_findparent_confirm(sc, &parent_ino);
242 	if (error)
243 		return NULLFSINO;
244 
245 	return parent_ino;
246 }
247 
248 /* Try to find the parent of the directory being repaired. */
249 STATIC int
250 xrep_dir_find_parent(
251 	struct xrep_dir		*rd)
252 {
253 	xfs_ino_t		ino;
254 
255 	ino = xrep_findparent_self_reference(rd->sc);
256 	if (ino != NULLFSINO) {
257 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
258 		return 0;
259 	}
260 
261 	ino = xrep_dir_dcache_parent(rd);
262 	if (ino != NULLFSINO) {
263 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
264 		return 0;
265 	}
266 
267 	ino = xrep_dir_lookup_parent(rd);
268 	if (ino != NULLFSINO) {
269 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
270 		return 0;
271 	}
272 
273 	/*
274 	 * A full filesystem scan is the last resort.  On a busy filesystem,
275 	 * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
276 	 * that we don't know what who the parent is, so we should return to
277 	 * userspace.
278 	 */
279 	return xrep_findparent_scan(&rd->pscan);
280 }
281 
282 /*
283  * Decide if we want to salvage this entry.  We don't bother with oversized
284  * names or the dot entry.
285  */
286 STATIC int
287 xrep_dir_want_salvage(
288 	struct xrep_dir		*rd,
289 	const char		*name,
290 	int			namelen,
291 	xfs_ino_t		ino)
292 {
293 	struct xfs_mount	*mp = rd->sc->mp;
294 
295 	/* No pointers to ourselves or to garbage. */
296 	if (ino == rd->sc->ip->i_ino)
297 		return false;
298 	if (!xfs_verify_dir_ino(mp, ino))
299 		return false;
300 
301 	/* No weird looking names or dot entries. */
302 	if (namelen >= MAXNAMELEN || namelen <= 0)
303 		return false;
304 	if (namelen == 1 && name[0] == '.')
305 		return false;
306 	if (!xfs_dir2_namecheck(name, namelen))
307 		return false;
308 
309 	return true;
310 }
311 
312 /*
313  * Remember that we want to create a dirent in the tempdir.  These stashed
314  * actions will be replayed later.
315  */
316 STATIC int
317 xrep_dir_stash_createname(
318 	struct xrep_dir		*rd,
319 	const struct xfs_name	*name,
320 	xfs_ino_t		ino)
321 {
322 	struct xrep_dirent	dirent = {
323 		.ino		= ino,
324 		.namelen	= name->len,
325 		.ftype		= name->type,
326 	};
327 	int			error;
328 
329 	trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
330 
331 	error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
332 	if (error)
333 		return error;
334 
335 	return xfarray_append(rd->dir_entries, &dirent);
336 }
337 
338 /* Allocate an in-core record to hold entries while we rebuild the dir data. */
339 STATIC int
340 xrep_dir_salvage_entry(
341 	struct xrep_dir		*rd,
342 	unsigned char		*name,
343 	unsigned int		namelen,
344 	xfs_ino_t		ino)
345 {
346 	struct xfs_name		xname = {
347 		.name		= name,
348 	};
349 	struct xfs_scrub	*sc = rd->sc;
350 	struct xfs_inode	*ip;
351 	unsigned int		i = 0;
352 	int			error = 0;
353 
354 	if (xchk_should_terminate(sc, &error))
355 		return error;
356 
357 	/*
358 	 * Truncate the name to the first character that would trip namecheck.
359 	 * If we no longer have a name after that, ignore this entry.
360 	 */
361 	while (i < namelen && name[i] != 0 && name[i] != '/')
362 		i++;
363 	if (i == 0)
364 		return 0;
365 	xname.len = i;
366 
367 	/* Ignore '..' entries; we already picked the new parent. */
368 	if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
369 		trace_xrep_dir_salvaged_parent(sc->ip, ino);
370 		return 0;
371 	}
372 
373 	trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
374 
375 	/*
376 	 * Compute the ftype or dump the entry if we can't.  We don't lock the
377 	 * inode because inodes can't change type while we have a reference.
378 	 */
379 	error = xchk_iget(sc, ino, &ip);
380 	if (error)
381 		return 0;
382 
383 	xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
384 	xchk_irele(sc, ip);
385 
386 	return xrep_dir_stash_createname(rd, &xname, ino);
387 }
388 
389 /* Record a shortform directory entry for later reinsertion. */
390 STATIC int
391 xrep_dir_salvage_sf_entry(
392 	struct xrep_dir			*rd,
393 	struct xfs_dir2_sf_hdr		*sfp,
394 	struct xfs_dir2_sf_entry	*sfep)
395 {
396 	xfs_ino_t			ino;
397 
398 	ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
399 	if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
400 		return 0;
401 
402 	return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
403 }
404 
405 /* Record a regular directory entry for later reinsertion. */
406 STATIC int
407 xrep_dir_salvage_data_entry(
408 	struct xrep_dir			*rd,
409 	struct xfs_dir2_data_entry	*dep)
410 {
411 	xfs_ino_t			ino;
412 
413 	ino = be64_to_cpu(dep->inumber);
414 	if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
415 		return 0;
416 
417 	return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
418 }
419 
420 /* Try to recover block/data format directory entries. */
421 STATIC int
422 xrep_dir_recover_data(
423 	struct xrep_dir		*rd,
424 	struct xfs_buf		*bp)
425 {
426 	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
427 	unsigned int		offset;
428 	unsigned int		end;
429 	int			error = 0;
430 
431 	/*
432 	 * Loop over the data portion of the block.
433 	 * Each object is a real entry (dep) or an unused one (dup).
434 	 */
435 	offset = geo->data_entry_offset;
436 	end = min_t(unsigned int, BBTOB(bp->b_length),
437 			xfs_dir3_data_end_offset(geo, bp->b_addr));
438 
439 	while (offset < end) {
440 		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
441 		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;
442 
443 		if (xchk_should_terminate(rd->sc, &error))
444 			return error;
445 
446 		/* Skip unused entries. */
447 		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
448 			offset += be16_to_cpu(dup->length);
449 			continue;
450 		}
451 
452 		/* Don't walk off the end of the block. */
453 		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
454 		if (offset > end)
455 			break;
456 
457 		/* Ok, let's save this entry. */
458 		error = xrep_dir_salvage_data_entry(rd, dep);
459 		if (error)
460 			return error;
461 
462 	}
463 
464 	return 0;
465 }
466 
467 /* Try to recover shortform directory entries. */
468 STATIC int
469 xrep_dir_recover_sf(
470 	struct xrep_dir			*rd)
471 {
472 	struct xfs_dir2_sf_hdr		*hdr;
473 	struct xfs_dir2_sf_entry	*sfep;
474 	struct xfs_dir2_sf_entry	*next;
475 	struct xfs_ifork		*ifp;
476 	xfs_ino_t			ino;
477 	unsigned char			*end;
478 	int				error = 0;
479 
480 	ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
481 	hdr = ifp->if_data;
482 	end = (unsigned char *)ifp->if_data + ifp->if_bytes;
483 
484 	ino = xfs_dir2_sf_get_parent_ino(hdr);
485 	trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
486 
487 	sfep = xfs_dir2_sf_firstentry(hdr);
488 	while ((unsigned char *)sfep < end) {
489 		if (xchk_should_terminate(rd->sc, &error))
490 			return error;
491 
492 		next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
493 		if ((unsigned char *)next > end)
494 			break;
495 
496 		/* Ok, let's save this entry. */
497 		error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
498 		if (error)
499 			return error;
500 
501 		sfep = next;
502 	}
503 
504 	return 0;
505 }
506 
507 /*
508  * Try to figure out the format of this directory from the data fork mappings
509  * and the directory size.  If we can be reasonably sure of format, we can be
510  * more aggressive in salvaging directory entries.  On return, @magic_guess
511  * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
512  * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
513  * and 0 if we can't tell.
514  */
515 STATIC void
516 xrep_dir_guess_format(
517 	struct xrep_dir		*rd,
518 	__be32			*magic_guess)
519 {
520 	struct xfs_inode	*dp = rd->sc->ip;
521 	struct xfs_mount	*mp = rd->sc->mp;
522 	struct xfs_da_geometry	*geo = mp->m_dir_geo;
523 	xfs_fileoff_t		last;
524 	int			error;
525 
526 	ASSERT(xfs_has_crc(mp));
527 
528 	*magic_guess = 0;
529 
530 	/*
531 	 * If there's a single directory block and the directory size is
532 	 * exactly one block, this has to be a single block format directory.
533 	 */
534 	error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
535 	if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
536 	    dp->i_disk_size == geo->blksize) {
537 		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
538 		return;
539 	}
540 
541 	/*
542 	 * If the last extent before the leaf offset matches the directory
543 	 * size and the directory size is larger than 1 block, this is a
544 	 * data format directory.
545 	 */
546 	last = geo->leafblk;
547 	error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
548 	if (!error &&
549 	    XFS_FSB_TO_B(mp, last) > geo->blksize &&
550 	    XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
551 		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
552 		return;
553 	}
554 }
555 
556 /* Recover directory entries from a specific directory block. */
557 STATIC int
558 xrep_dir_recover_dirblock(
559 	struct xrep_dir		*rd,
560 	__be32			magic_guess,
561 	xfs_dablk_t		dabno)
562 {
563 	struct xfs_dir2_data_hdr *hdr;
564 	struct xfs_buf		*bp;
565 	__be32			oldmagic;
566 	int			error;
567 
568 	/*
569 	 * Try to read buffer.  We invalidate them in the next step so we don't
570 	 * bother to set a buffer type or ops.
571 	 */
572 	error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
573 			XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
574 	if (error || !bp)
575 		return error;
576 
577 	hdr = bp->b_addr;
578 	oldmagic = hdr->magic;
579 
580 	trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
581 			be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
582 
583 	/*
584 	 * If we're sure of the block's format, proceed with the salvage
585 	 * operation using the specified magic number.
586 	 */
587 	if (magic_guess) {
588 		hdr->magic = magic_guess;
589 		goto recover;
590 	}
591 
592 	/*
593 	 * If we couldn't guess what type of directory this is, then we will
594 	 * only salvage entries from directory blocks that match the magic
595 	 * number and pass verifiers.
596 	 */
597 	switch (hdr->magic) {
598 	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
599 	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
600 		if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
601 			goto out;
602 		if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
603 			goto out;
604 		break;
605 	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
606 	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
607 		if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
608 			goto out;
609 		if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
610 			goto out;
611 		break;
612 	default:
613 		goto out;
614 	}
615 
616 recover:
617 	error = xrep_dir_recover_data(rd, bp);
618 
619 out:
620 	hdr->magic = oldmagic;
621 	xfs_trans_brelse(rd->sc->tp, bp);
622 	return error;
623 }
624 
625 static inline void
626 xrep_dir_init_args(
627 	struct xrep_dir		*rd,
628 	struct xfs_inode	*dp,
629 	const struct xfs_name	*name)
630 {
631 	memset(&rd->args, 0, sizeof(struct xfs_da_args));
632 	rd->args.geo = rd->sc->mp->m_dir_geo;
633 	rd->args.whichfork = XFS_DATA_FORK;
634 	rd->args.owner = rd->sc->ip->i_ino;
635 	rd->args.trans = rd->sc->tp;
636 	rd->args.dp = dp;
637 	if (!name)
638 		return;
639 	rd->args.name = name->name;
640 	rd->args.namelen = name->len;
641 	rd->args.filetype = name->type;
642 	rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
643 }
644 
645 /* Replay a stashed createname into the temporary directory. */
646 STATIC int
647 xrep_dir_replay_createname(
648 	struct xrep_dir		*rd,
649 	const struct xfs_name	*name,
650 	xfs_ino_t		inum,
651 	xfs_extlen_t		total)
652 {
653 	struct xfs_scrub	*sc = rd->sc;
654 	struct xfs_inode	*dp = rd->sc->tempip;
655 	bool			is_block, is_leaf;
656 	int			error;
657 
658 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
659 
660 	error = xfs_dir_ino_validate(sc->mp, inum);
661 	if (error)
662 		return error;
663 
664 	trace_xrep_dir_replay_createname(dp, name, inum);
665 
666 	xrep_dir_init_args(rd, dp, name);
667 	rd->args.inumber = inum;
668 	rd->args.total = total;
669 	rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
670 
671 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
672 		return xfs_dir2_sf_addname(&rd->args);
673 
674 	error = xfs_dir2_isblock(&rd->args, &is_block);
675 	if (error)
676 		return error;
677 	if (is_block)
678 		return xfs_dir2_block_addname(&rd->args);
679 
680 	error = xfs_dir2_isleaf(&rd->args, &is_leaf);
681 	if (error)
682 		return error;
683 	if (is_leaf)
684 		return xfs_dir2_leaf_addname(&rd->args);
685 
686 	return xfs_dir2_node_addname(&rd->args);
687 }
688 
689 /*
690  * Add this stashed incore directory entry to the temporary directory.
691  * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
692  * must not be in transaction context.
693  */
694 STATIC int
695 xrep_dir_replay_update(
696 	struct xrep_dir			*rd,
697 	const struct xfs_name		*xname,
698 	const struct xrep_dirent	*dirent)
699 {
700 	struct xfs_mount		*mp = rd->sc->mp;
701 #ifdef DEBUG
702 	xfs_ino_t			ino;
703 #endif
704 	uint				resblks;
705 	int				error;
706 
707 	resblks = XFS_LINK_SPACE_RES(mp, xname->len);
708 	error = xchk_trans_alloc(rd->sc, resblks);
709 	if (error)
710 		return error;
711 
712 	/* Lock the temporary directory and join it to the transaction */
713 	xrep_tempfile_ilock(rd->sc);
714 	xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
715 
716 	/*
717 	 * Create a replacement dirent in the temporary directory.  Note that
718 	 * _createname doesn't check for existing entries.  There shouldn't be
719 	 * any in the temporary dir, but we'll verify this in debug mode.
720 	 */
721 #ifdef DEBUG
722 	error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
723 	if (error != -ENOENT) {
724 		ASSERT(error != -ENOENT);
725 		goto out_cancel;
726 	}
727 #endif
728 
729 	error = xrep_dir_replay_createname(rd, xname, dirent->ino, resblks);
730 	if (error)
731 		goto out_cancel;
732 
733 	if (xname->type == XFS_DIR3_FT_DIR)
734 		rd->subdirs++;
735 	rd->dirents++;
736 
737 	/* Commit and unlock. */
738 	error = xrep_trans_commit(rd->sc);
739 	if (error)
740 		return error;
741 
742 	xrep_tempfile_iunlock(rd->sc);
743 	return 0;
744 out_cancel:
745 	xchk_trans_cancel(rd->sc);
746 	xrep_tempfile_iunlock(rd->sc);
747 	return error;
748 }
749 
750 /*
751  * Flush stashed incore dirent updates that have been recorded by the scanner.
752  * This is done to reduce the memory requirements of the directory rebuild,
753  * since directories can contain up to 32GB of directory data.
754  *
755  * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
756  * IOLOCK.
757  */
758 STATIC int
759 xrep_dir_replay_updates(
760 	struct xrep_dir		*rd)
761 {
762 	xfarray_idx_t		array_cur;
763 	int			error;
764 
765 	/* Add all the salvaged dirents to the temporary directory. */
766 	foreach_xfarray_idx(rd->dir_entries, array_cur) {
767 		struct xrep_dirent	dirent;
768 
769 		error = xfarray_load(rd->dir_entries, array_cur, &dirent);
770 		if (error)
771 			return error;
772 
773 		error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
774 				&rd->xname, dirent.namelen);
775 		if (error)
776 			return error;
777 		rd->xname.type = dirent.ftype;
778 
779 		error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
780 		if (error)
781 			return error;
782 	}
783 
784 	/* Empty out both arrays now that we've added the entries. */
785 	xfarray_truncate(rd->dir_entries);
786 	xfblob_truncate(rd->dir_names);
787 	return 0;
788 }
789 
790 /*
791  * Periodically flush stashed directory entries to the temporary dir.  This
792  * is done to reduce the memory requirements of the directory rebuild, since
793  * directories can contain up to 32GB of directory data.
794  */
795 STATIC int
796 xrep_dir_flush_stashed(
797 	struct xrep_dir		*rd)
798 {
799 	int			error;
800 
801 	/*
802 	 * Entering this function, the scrub context has a reference to the
803 	 * inode being repaired, the temporary file, and a scrub transaction
804 	 * that we use during dirent salvaging to avoid livelocking if there
805 	 * are cycles in the directory structures.  We hold ILOCK_EXCL on both
806 	 * the inode being repaired and the temporary file, though they are
807 	 * not ijoined to the scrub transaction.
808 	 *
809 	 * To constrain kernel memory use, we occasionally write salvaged
810 	 * dirents from the xfarray and xfblob structures into the temporary
811 	 * directory in preparation for exchanging the directory structures at
812 	 * the end.  Updating the temporary file requires a transaction, so we
813 	 * commit the scrub transaction and drop the two ILOCKs so that
814 	 * we can allocate whatever transaction we want.
815 	 *
816 	 * We still hold IOLOCK_EXCL on the inode being repaired, which
817 	 * prevents anyone from accessing the damaged directory data while we
818 	 * repair it.
819 	 */
820 	error = xrep_trans_commit(rd->sc);
821 	if (error)
822 		return error;
823 	xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
824 
825 	/*
826 	 * Take the IOLOCK of the temporary file while we modify dirents.  This
827 	 * isn't strictly required because the temporary file is never revealed
828 	 * to userspace, but we follow the same locking rules.  We still hold
829 	 * sc->ip's IOLOCK.
830 	 */
831 	error = xrep_tempfile_iolock_polled(rd->sc);
832 	if (error)
833 		return error;
834 
835 	/* Write to the tempdir all the updates that we've stashed. */
836 	error = xrep_dir_replay_updates(rd);
837 	xrep_tempfile_iounlock(rd->sc);
838 	if (error)
839 		return error;
840 
841 	/*
842 	 * Recreate the salvage transaction and relock the dir we're salvaging.
843 	 */
844 	error = xchk_trans_alloc(rd->sc, 0);
845 	if (error)
846 		return error;
847 	xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
848 	return 0;
849 }
850 
851 /* Decide if we've stashed too much dirent data in memory. */
852 static inline bool
853 xrep_dir_want_flush_stashed(
854 	struct xrep_dir		*rd)
855 {
856 	unsigned long long	bytes;
857 
858 	bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
859 	return bytes > XREP_DIR_MAX_STASH_BYTES;
860 }
861 
862 /* Extract as many directory entries as we can. */
863 STATIC int
864 xrep_dir_recover(
865 	struct xrep_dir		*rd)
866 {
867 	struct xfs_bmbt_irec	got;
868 	struct xfs_scrub	*sc = rd->sc;
869 	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
870 	xfs_fileoff_t		offset;
871 	xfs_dablk_t		dabno;
872 	__be32			magic_guess;
873 	int			nmap;
874 	int			error;
875 
876 	xrep_dir_guess_format(rd, &magic_guess);
877 
878 	/* Iterate each directory data block in the data fork. */
879 	for (offset = 0;
880 	     offset < geo->leafblk;
881 	     offset = got.br_startoff + got.br_blockcount) {
882 		nmap = 1;
883 		error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
884 				&got, &nmap, 0);
885 		if (error)
886 			return error;
887 		if (nmap != 1)
888 			return -EFSCORRUPTED;
889 		if (!xfs_bmap_is_written_extent(&got))
890 			continue;
891 
892 		for (dabno = round_up(got.br_startoff, geo->fsbcount);
893 		     dabno < got.br_startoff + got.br_blockcount;
894 		     dabno += geo->fsbcount) {
895 			if (xchk_should_terminate(rd->sc, &error))
896 				return error;
897 
898 			error = xrep_dir_recover_dirblock(rd,
899 					magic_guess, dabno);
900 			if (error)
901 				return error;
902 
903 			/* Flush dirents to constrain memory usage. */
904 			if (xrep_dir_want_flush_stashed(rd)) {
905 				error = xrep_dir_flush_stashed(rd);
906 				if (error)
907 					return error;
908 			}
909 		}
910 	}
911 
912 	return 0;
913 }
914 
915 /*
916  * Find all the directory entries for this inode by scraping them out of the
917  * directory leaf blocks by hand, and flushing them into the temp dir.
918  */
919 STATIC int
920 xrep_dir_find_entries(
921 	struct xrep_dir		*rd)
922 {
923 	struct xfs_inode	*dp = rd->sc->ip;
924 	int			error;
925 
926 	/*
927 	 * Salvage directory entries from the old directory, and write them to
928 	 * the temporary directory.
929 	 */
930 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
931 		error = xrep_dir_recover_sf(rd);
932 	} else {
933 		error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
934 		if (error)
935 			return error;
936 
937 		error = xrep_dir_recover(rd);
938 	}
939 	if (error)
940 		return error;
941 
942 	return xrep_dir_flush_stashed(rd);
943 }
944 
945 /* Scan all files in the filesystem for dirents. */
946 STATIC int
947 xrep_dir_salvage_entries(
948 	struct xrep_dir		*rd)
949 {
950 	struct xfs_scrub	*sc = rd->sc;
951 	int			error;
952 
953 	/*
954 	 * Drop the ILOCK on this directory so that we can scan for this
955 	 * directory's parent.  Figure out who is going to be the parent of
956 	 * this directory, then retake the ILOCK so that we can salvage
957 	 * directory entries.
958 	 */
959 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
960 	error = xrep_dir_find_parent(rd);
961 	xchk_ilock(sc, XFS_ILOCK_EXCL);
962 	if (error)
963 		return error;
964 
965 	/*
966 	 * Collect directory entries by parsing raw leaf blocks to salvage
967 	 * whatever we can.  When we're done, free the staging memory before
968 	 * exchanging the directories to reduce memory usage.
969 	 */
970 	error = xrep_dir_find_entries(rd);
971 	if (error)
972 		return error;
973 
974 	/*
975 	 * Cancel the repair transaction and drop the ILOCK so that we can
976 	 * (later) use the atomic mapping exchange functions to compute the
977 	 * correct block reservations and re-lock the inodes.
978 	 *
979 	 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
980 	 * modifications, but there's nothing to prevent userspace from reading
981 	 * the directory until we're ready for the exchange operation.  Reads
982 	 * will return -EIO without shutting down the fs, so we're ok with
983 	 * that.
984 	 *
985 	 * The VFS can change dotdot on us, but the findparent scan will keep
986 	 * our incore parent inode up to date.  See the note on locking issues
987 	 * for more details.
988 	 */
989 	error = xrep_trans_commit(sc);
990 	if (error)
991 		return error;
992 
993 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
994 	return 0;
995 }
996 
997 
998 /*
999  * Free all the directory blocks and reset the data fork.  The caller must
1000  * join the inode to the transaction.  This function returns with the inode
1001  * joined to a clean scrub transaction.
1002  */
1003 STATIC int
1004 xrep_dir_reset_fork(
1005 	struct xrep_dir		*rd,
1006 	xfs_ino_t		parent_ino)
1007 {
1008 	struct xfs_scrub	*sc = rd->sc;
1009 	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1010 	int			error;
1011 
1012 	/* Unmap all the directory buffers. */
1013 	if (xfs_ifork_has_extents(ifp)) {
1014 		error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1015 		if (error)
1016 			return error;
1017 	}
1018 
1019 	trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1020 
1021 	/* Reset the data fork to an empty data fork. */
1022 	xfs_idestroy_fork(ifp);
1023 	ifp->if_bytes = 0;
1024 	sc->tempip->i_disk_size = 0;
1025 
1026 	/* Reinitialize the short form directory. */
1027 	xrep_dir_init_args(rd, sc->tempip, NULL);
1028 	return xfs_dir2_sf_create(&rd->args, parent_ino);
1029 }
1030 
1031 /*
1032  * Prepare both inodes' directory forks for exchanging mappings.  Promote the
1033  * tempfile from short format to leaf format, and if the file being repaired
1034  * has a short format data fork, turn it into an empty extent list.
1035  */
1036 STATIC int
1037 xrep_dir_swap_prep(
1038 	struct xfs_scrub	*sc,
1039 	bool			temp_local,
1040 	bool			ip_local)
1041 {
1042 	int			error;
1043 
1044 	/*
1045 	 * If the tempfile's directory is in shortform format, convert that to
1046 	 * a single leaf extent so that we can use the atomic mapping exchange.
1047 	 */
1048 	if (temp_local) {
1049 		struct xfs_da_args	args = {
1050 			.dp		= sc->tempip,
1051 			.geo		= sc->mp->m_dir_geo,
1052 			.whichfork	= XFS_DATA_FORK,
1053 			.trans		= sc->tp,
1054 			.total		= 1,
1055 			.owner		= sc->ip->i_ino,
1056 		};
1057 
1058 		error = xfs_dir2_sf_to_block(&args);
1059 		if (error)
1060 			return error;
1061 
1062 		/*
1063 		 * Roll the deferred log items to get us back to a clean
1064 		 * transaction.
1065 		 */
1066 		error = xfs_defer_finish(&sc->tp);
1067 		if (error)
1068 			return error;
1069 	}
1070 
1071 	/*
1072 	 * If the file being repaired had a shortform data fork, convert that
1073 	 * to an empty extent list in preparation for the atomic mapping
1074 	 * exchange.
1075 	 */
1076 	if (ip_local) {
1077 		struct xfs_ifork	*ifp;
1078 
1079 		ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1080 		xfs_idestroy_fork(ifp);
1081 		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1082 		ifp->if_nextents = 0;
1083 		ifp->if_bytes = 0;
1084 		ifp->if_data = NULL;
1085 		ifp->if_height = 0;
1086 
1087 		xfs_trans_log_inode(sc->tp, sc->ip,
1088 				XFS_ILOG_CORE | XFS_ILOG_DDATA);
1089 	}
1090 
1091 	return 0;
1092 }
1093 
1094 /*
1095  * Replace the inode number of a directory entry.
1096  */
1097 static int
1098 xrep_dir_replace(
1099 	struct xrep_dir		*rd,
1100 	struct xfs_inode	*dp,
1101 	const struct xfs_name	*name,
1102 	xfs_ino_t		inum,
1103 	xfs_extlen_t		total)
1104 {
1105 	struct xfs_scrub	*sc = rd->sc;
1106 	bool			is_block, is_leaf;
1107 	int			error;
1108 
1109 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1110 
1111 	error = xfs_dir_ino_validate(sc->mp, inum);
1112 	if (error)
1113 		return error;
1114 
1115 	xrep_dir_init_args(rd, dp, name);
1116 	rd->args.inumber = inum;
1117 	rd->args.total = total;
1118 
1119 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL)
1120 		return xfs_dir2_sf_replace(&rd->args);
1121 
1122 	error = xfs_dir2_isblock(&rd->args, &is_block);
1123 	if (error)
1124 		return error;
1125 	if (is_block)
1126 		return xfs_dir2_block_replace(&rd->args);
1127 
1128 	error = xfs_dir2_isleaf(&rd->args, &is_leaf);
1129 	if (error)
1130 		return error;
1131 	if (is_leaf)
1132 		return xfs_dir2_leaf_replace(&rd->args);
1133 
1134 	return xfs_dir2_node_replace(&rd->args);
1135 }
1136 
1137 /*
1138  * Reset the link count of this directory and adjust the unlinked list pointers
1139  * as needed.
1140  */
1141 STATIC int
1142 xrep_dir_set_nlink(
1143 	struct xrep_dir		*rd)
1144 {
1145 	struct xfs_scrub	*sc = rd->sc;
1146 	struct xfs_inode	*dp = sc->ip;
1147 	struct xfs_perag	*pag;
1148 	unsigned int		new_nlink = rd->subdirs + 2;
1149 	int			error;
1150 
1151 	/*
1152 	 * The directory is not on the incore unlinked list, which means that
1153 	 * it needs to be reachable via the directory tree.  Update the nlink
1154 	 * with our observed link count.  If the directory has no parent, it
1155 	 * will be moved to the orphanage.
1156 	 */
1157 	if (!xfs_inode_on_unlinked_list(dp))
1158 		goto reset_nlink;
1159 
1160 	/*
1161 	 * The directory is on the unlinked list and we did not find any
1162 	 * dirents.  Set the link count to zero and let the directory
1163 	 * inactivate when the last reference drops.
1164 	 */
1165 	if (rd->dirents == 0) {
1166 		rd->needs_adoption = false;
1167 		new_nlink = 0;
1168 		goto reset_nlink;
1169 	}
1170 
1171 	/*
1172 	 * The directory is on the unlinked list and we found dirents.  This
1173 	 * directory needs to be reachable via the directory tree.  Remove the
1174 	 * dir from the unlinked list and update nlink with the observed link
1175 	 * count.  If the directory has no parent, it will be moved to the
1176 	 * orphanage.
1177 	 */
1178 	pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1179 	if (!pag) {
1180 		ASSERT(0);
1181 		return -EFSCORRUPTED;
1182 	}
1183 
1184 	error = xfs_iunlink_remove(sc->tp, pag, dp);
1185 	xfs_perag_put(pag);
1186 	if (error)
1187 		return error;
1188 
1189 reset_nlink:
1190 	if (VFS_I(dp)->i_nlink != new_nlink)
1191 		set_nlink(VFS_I(dp), new_nlink);
1192 	return 0;
1193 }
1194 
1195 /* Exchange the temporary directory's data fork with the one being repaired. */
1196 STATIC int
1197 xrep_dir_swap(
1198 	struct xrep_dir		*rd)
1199 {
1200 	struct xfs_scrub	*sc = rd->sc;
1201 	bool			ip_local, temp_local;
1202 	int			error = 0;
1203 
1204 	/*
1205 	 * If we found enough subdirs to overflow this directory's link count,
1206 	 * bail out to userspace before we modify anything.
1207 	 */
1208 	if (rd->subdirs + 2 > XFS_MAXLINK)
1209 		return -EFSCORRUPTED;
1210 
1211 	/*
1212 	 * If we never found the parent for this directory, temporarily assign
1213 	 * the root dir as the parent; we'll move this to the orphanage after
1214 	 * exchanging the dir contents.  We hold the ILOCK of the dir being
1215 	 * repaired, so we're not worried about racy updates of dotdot.
1216 	 */
1217 	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1218 	if (rd->pscan.parent_ino == NULLFSINO) {
1219 		rd->needs_adoption = true;
1220 		rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1221 	}
1222 
1223 	/*
1224 	 * Reset the temporary directory's '..' entry to point to the parent
1225 	 * that we found.  The temporary directory was created with the root
1226 	 * directory as the parent, so we can skip this if repairing a
1227 	 * subdirectory of the root.
1228 	 *
1229 	 * It's also possible that this replacement could also expand a sf
1230 	 * tempdir into block format.
1231 	 */
1232 	if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
1233 		error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1234 				rd->pscan.parent_ino, rd->tx.req.resblks);
1235 		if (error)
1236 			return error;
1237 	}
1238 
1239 	/*
1240 	 * Changing the dot and dotdot entries could have changed the shape of
1241 	 * the directory, so we recompute these.
1242 	 */
1243 	ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1244 	temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1245 
1246 	/*
1247 	 * If the both files have a local format data fork and the rebuilt
1248 	 * directory data would fit in the repaired file's data fork, copy
1249 	 * the contents from the tempfile and update the directory link count.
1250 	 * We're done now.
1251 	 */
1252 	if (ip_local && temp_local &&
1253 	    sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1254 		xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1255 		return xrep_dir_set_nlink(rd);
1256 	}
1257 
1258 	/*
1259 	 * Clean the transaction before we start working on exchanging
1260 	 * directory contents.
1261 	 */
1262 	error = xrep_tempfile_roll_trans(rd->sc);
1263 	if (error)
1264 		return error;
1265 
1266 	/* Otherwise, make sure both data forks are in block-mapping mode. */
1267 	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1268 	if (error)
1269 		return error;
1270 
1271 	/*
1272 	 * Set nlink of the directory in the same transaction sequence that
1273 	 * (atomically) commits the new directory data.
1274 	 */
1275 	error = xrep_dir_set_nlink(rd);
1276 	if (error)
1277 		return error;
1278 
1279 	return xrep_tempexch_contents(sc, &rd->tx);
1280 }
1281 
1282 /*
1283  * Exchange the new directory contents (which we created in the tempfile) with
1284  * the directory being repaired.
1285  */
1286 STATIC int
1287 xrep_dir_rebuild_tree(
1288 	struct xrep_dir		*rd)
1289 {
1290 	struct xfs_scrub	*sc = rd->sc;
1291 	int			error;
1292 
1293 	trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1294 
1295 	/*
1296 	 * Take the IOLOCK on the temporary file so that we can run dir
1297 	 * operations with the same locks held as we would for a normal file.
1298 	 * We still hold sc->ip's IOLOCK.
1299 	 */
1300 	error = xrep_tempfile_iolock_polled(rd->sc);
1301 	if (error)
1302 		return error;
1303 
1304 	/* Allocate transaction and ILOCK the scrub file and the temp file. */
1305 	error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1306 	if (error)
1307 		return error;
1308 
1309 	/*
1310 	 * Exchange the tempdir's data fork with the file being repaired.  This
1311 	 * recreates the transaction and re-takes the ILOCK in the scrub
1312 	 * context.
1313 	 */
1314 	error = xrep_dir_swap(rd);
1315 	if (error)
1316 		return error;
1317 
1318 	/*
1319 	 * Release the old directory blocks and reset the data fork of the temp
1320 	 * directory to an empty shortform directory because inactivation does
1321 	 * nothing for directories.
1322 	 */
1323 	error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1324 	if (error)
1325 		return error;
1326 
1327 	/*
1328 	 * Roll to get a transaction without any inodes joined to it.  Then we
1329 	 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1330 	 * the scrub target directory.
1331 	 */
1332 	error = xfs_trans_roll(&sc->tp);
1333 	if (error)
1334 		return error;
1335 
1336 	xrep_tempfile_iunlock(sc);
1337 	xrep_tempfile_iounlock(sc);
1338 	return 0;
1339 }
1340 
1341 /* Set up the filesystem scan so we can regenerate directory entries. */
1342 STATIC int
1343 xrep_dir_setup_scan(
1344 	struct xrep_dir		*rd)
1345 {
1346 	struct xfs_scrub	*sc = rd->sc;
1347 	char			*descr;
1348 	int			error;
1349 
1350 	/* Set up some staging memory for salvaging dirents. */
1351 	descr = xchk_xfile_ino_descr(sc, "directory entries");
1352 	error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
1353 			&rd->dir_entries);
1354 	kfree(descr);
1355 	if (error)
1356 		return error;
1357 
1358 	descr = xchk_xfile_ino_descr(sc, "directory entry names");
1359 	error = xfblob_create(descr, &rd->dir_names);
1360 	kfree(descr);
1361 	if (error)
1362 		goto out_xfarray;
1363 
1364 	error = xrep_findparent_scan_start(sc, &rd->pscan);
1365 	if (error)
1366 		goto out_xfblob;
1367 
1368 	return 0;
1369 
1370 out_xfblob:
1371 	xfblob_destroy(rd->dir_names);
1372 	rd->dir_names = NULL;
1373 out_xfarray:
1374 	xfarray_destroy(rd->dir_entries);
1375 	rd->dir_entries = NULL;
1376 	return error;
1377 }
1378 
1379 /*
1380  * Move the current file to the orphanage.
1381  *
1382  * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks.  Upon
1383  * successful return, the scrub transaction will have enough extra reservation
1384  * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
1385  * orphanage; and both inodes will be ijoined.
1386  */
1387 STATIC int
1388 xrep_dir_move_to_orphanage(
1389 	struct xrep_dir		*rd)
1390 {
1391 	struct xfs_scrub	*sc = rd->sc;
1392 	xfs_ino_t		orig_parent, new_parent;
1393 	int			error;
1394 
1395 	/*
1396 	 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
1397 	 * prepare for the adoption.  Therefore, look up the old dotdot entry
1398 	 * for sc->ip so that we can compare it after we re-lock sc->ip.
1399 	 */
1400 	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
1401 	if (error)
1402 		return error;
1403 
1404 	/*
1405 	 * Drop the ILOCK on the scrub target and commit the transaction.
1406 	 * Adoption computes its own resource requirements and gathers the
1407 	 * necessary components.
1408 	 */
1409 	error = xrep_trans_commit(sc);
1410 	if (error)
1411 		return error;
1412 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1413 
1414 	/* If we can take the orphanage's iolock then we're ready to move. */
1415 	if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
1416 		xchk_iunlock(sc, sc->ilock_flags);
1417 		error = xrep_orphanage_iolock_two(sc);
1418 		if (error)
1419 			return error;
1420 	}
1421 
1422 	/* Grab transaction and ILOCK the two files. */
1423 	error = xrep_adoption_trans_alloc(sc, &rd->adoption);
1424 	if (error)
1425 		return error;
1426 
1427 	error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
1428 	if (error)
1429 		return error;
1430 
1431 	/*
1432 	 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
1433 	 * entry again.  If the parent changed or the child was unlinked while
1434 	 * the child directory was unlocked, we don't need to move the child to
1435 	 * the orphanage after all.
1436 	 */
1437 	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
1438 	if (error)
1439 		return error;
1440 
1441 	/*
1442 	 * Attach to the orphanage if we still have a linked directory and it
1443 	 * hasn't been moved.
1444 	 */
1445 	if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
1446 		error = xrep_adoption_move(&rd->adoption);
1447 		if (error)
1448 			return error;
1449 	}
1450 
1451 	/*
1452 	 * Launder the scrub transaction so we can drop the orphanage ILOCK
1453 	 * and IOLOCK.  Return holding the scrub target's ILOCK and IOLOCK.
1454 	 */
1455 	error = xrep_adoption_trans_roll(&rd->adoption);
1456 	if (error)
1457 		return error;
1458 
1459 	xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
1460 	xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
1461 	return 0;
1462 }
1463 
1464 /*
1465  * Repair the directory metadata.
1466  *
1467  * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
1468  * cache in XFS can't handle aliased multiblock buffers, so this might
1469  * misbehave if the directory blocks are crosslinked with other filesystem
1470  * metadata.
1471  *
1472  * XXX: Is it necessary to check the dcache for this directory to make sure
1473  * that we always recreate every cached entry?
1474  */
1475 int
1476 xrep_directory(
1477 	struct xfs_scrub	*sc)
1478 {
1479 	struct xrep_dir		*rd = sc->buf;
1480 	int			error;
1481 
1482 	/* The rmapbt is required to reap the old data fork. */
1483 	if (!xfs_has_rmapbt(sc->mp))
1484 		return -EOPNOTSUPP;
1485 
1486 	error = xrep_dir_setup_scan(rd);
1487 	if (error)
1488 		return error;
1489 
1490 	error = xrep_dir_salvage_entries(rd);
1491 	if (error)
1492 		goto out_teardown;
1493 
1494 	/* Last chance to abort before we start committing fixes. */
1495 	if (xchk_should_terminate(sc, &error))
1496 		goto out_teardown;
1497 
1498 	error = xrep_dir_rebuild_tree(rd);
1499 	if (error)
1500 		goto out_teardown;
1501 
1502 	if (rd->needs_adoption) {
1503 		if (!xrep_orphanage_can_adopt(rd->sc))
1504 			error = -EFSCORRUPTED;
1505 		else
1506 			error = xrep_dir_move_to_orphanage(rd);
1507 		if (error)
1508 			goto out_teardown;
1509 	}
1510 
1511 out_teardown:
1512 	xrep_dir_teardown(sc);
1513 	return error;
1514 }
1515