xref: /linux/fs/xfs/scrub/dir_repair.c (revision bf36793fa260cb68cc817f311f1f683788261796)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_bit.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_icache.h"
19 #include "xfs_da_format.h"
20 #include "xfs_da_btree.h"
21 #include "xfs_dir2.h"
22 #include "xfs_dir2_priv.h"
23 #include "xfs_bmap.h"
24 #include "xfs_quota.h"
25 #include "xfs_bmap_btree.h"
26 #include "xfs_trans_space.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_exchmaps.h"
29 #include "xfs_exchrange.h"
30 #include "xfs_ag.h"
31 #include "xfs_parent.h"
32 #include "scrub/xfs_scrub.h"
33 #include "scrub/scrub.h"
34 #include "scrub/common.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/tempfile.h"
38 #include "scrub/tempexch.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/xfblob.h"
42 #include "scrub/iscan.h"
43 #include "scrub/readdir.h"
44 #include "scrub/reap.h"
45 #include "scrub/findparent.h"
46 #include "scrub/orphanage.h"
47 #include "scrub/listxattr.h"
48 
49 /*
50  * Directory Repair
51  * ================
52  *
53  * We repair directories by reading the directory data blocks looking for
54  * directory entries that look salvageable (name passes verifiers, entry points
55  * to a valid allocated inode, etc).  Each entry worth salvaging is stashed in
56  * memory, and the stashed entries are periodically replayed into a temporary
57  * directory to constrain memory use.  Batching the construction of the
58  * temporary directory in this fashion reduces lock cycling of the directory
59  * being repaired and the temporary directory, and will later become important
60  * for parent pointer scanning.
61  *
62  * If parent pointers are enabled on this filesystem, we instead reconstruct
63  * the directory by visiting each parent pointer of each file in the filesystem
64  * and translating the relevant parent pointer records into dirents.  In this
65  * case, it is advantageous to stash all directory entries created from parent
66  * pointers for a single child file before replaying them into the temporary
67  * directory.  To save memory, the live filesystem scan reuses the findparent
68  * fields.  Directory repair chooses either parent pointer scanning or
69  * directory entry salvaging, but not both.
70  *
71  * Directory entries added to the temporary directory do not elevate the link
72  * counts of the inodes found.  When salvaging completes, the remaining stashed
73  * entries are replayed to the temporary directory.  An atomic mapping exchange
74  * is used to commit the new directory blocks to the directory being repaired.
75  * This will disrupt readdir cursors.
76  *
77  * Locking Issues
78  * --------------
79  *
80  * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
81  * /a/b for a "mv /a/b /c/" operation.  This means that only b's ILOCK protects
82  * b's dotdot update.  This is in contrast to every other dotdot update (link,
83  * remove, mkdir).  If the repair code drops the ILOCK, it must either
84  * revalidate the dotdot entry or use dirent hooks to capture updates from
85  * other threads.
86  */
87 
/*
 * Action codes for stashed directory updates.  One of these values is stored
 * in the action field of each struct xrep_dirent and selects what the replay
 * code does with the record.
 */

/* Create a dirent in the tempdir. */
#define XREP_DIRENT_ADD		(1)

/* Remove a dirent from the tempdir. */
#define XREP_DIRENT_REMOVE	(2)
93 
/*
 * Directory entry to be restored in the new directory.
 *
 * One of these fixed-size records is appended to the dir_entries xfarray for
 * every stashed update.  The name bytes are not kept here; they are stored in
 * the dir_names xfblob and located again through @name_cookie.
 */
struct xrep_dirent {
	/* Cookie for retrieval of the dirent name. */
	xfblob_cookie		name_cookie;

	/* Target inode number. */
	xfs_ino_t		ino;

	/* Length of the dirent name. */
	uint8_t			namelen;

	/* File type of the dirent. */
	uint8_t			ftype;

	/* XREP_DIRENT_{ADD,REMOVE} */
	uint8_t			action;
};
111 
/*
 * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
 * before we write them to the temp dir.  The combined size of both stores is
 * compared against this limit to decide when to flush.
 */
#define XREP_DIR_MAX_STASH_BYTES	(PAGE_SIZE * 8)
117 
/*
 * Incore state of one directory repair.  Allocated zeroed in
 * xrep_setup_directory() and attached to the scrub context as sc->buf.
 */
struct xrep_dir {
	/* Scrub context that owns this repair. */
	struct xfs_scrub	*sc;

	/* Fixed-size array of xrep_dirent structures. */
	struct xfarray		*dir_entries;

	/* Blobs containing directory entry names. */
	struct xfblob		*dir_names;

	/* Information for exchanging data forks at the end. */
	struct xrep_tempexch	tx;

	/*
	 * Preallocated args struct for performing dir operations.  It is
	 * wiped and refilled by xrep_dir_init_args() before each use.
	 */
	struct xfs_da_args	args;

	/*
	 * Information used to scan the filesystem to find the inumber of the
	 * dotdot entry for this directory.  For directory salvaging when
	 * parent pointers are not enabled, we use the findparent_* functions
	 * on this object and access only the parent_ino field directly.
	 *
	 * When parent pointers are enabled, however, the pptr scanner uses the
	 * iscan, hooks, lock, and parent_ino fields of this object directly.
	 * @pscan.lock coordinates access to dir_entries, dir_names,
	 * parent_ino, subdirs, dirents, and args.  This reduces the memory
	 * requirements of this structure.
	 */
	struct xrep_parent_scan_info pscan;

	/*
	 * Context information for attaching this directory to the lost+found
	 * if this directory does not have a parent.
	 */
	struct xrep_adoption	adoption;

	/*
	 * How many subdirectories did we find?  Adjusted as stashed updates
	 * of type XFS_DIR3_FT_DIR are replayed into the tempdir.
	 */
	uint64_t		subdirs;

	/*
	 * How many dirents did we find?  Adjusted as stashed updates are
	 * replayed into the tempdir.
	 */
	unsigned int		dirents;

	/* Should we move this directory to the orphanage? */
	bool			needs_adoption;

	/*
	 * Directory entry name, plus the trailing null.  @xname.name is
	 * pointed at @namebuf when the structure is set up, so names loaded
	 * from dir_names land in @namebuf.
	 */
	struct xfs_name		xname;
	unsigned char		namebuf[MAXNAMELEN];
};
166 
167 /* Tear down all the incore stuff we created. */
168 static void
169 xrep_dir_teardown(
170 	struct xfs_scrub	*sc)
171 {
172 	struct xrep_dir		*rd = sc->buf;
173 
174 	xrep_findparent_scan_teardown(&rd->pscan);
175 	xfblob_destroy(rd->dir_names);
176 	xfarray_destroy(rd->dir_entries);
177 }
178 
179 /* Set up for a directory repair. */
180 int
181 xrep_setup_directory(
182 	struct xfs_scrub	*sc)
183 {
184 	struct xrep_dir		*rd;
185 	int			error;
186 
187 	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
188 
189 	error = xrep_orphanage_try_create(sc);
190 	if (error)
191 		return error;
192 
193 	error = xrep_tempfile_create(sc, S_IFDIR);
194 	if (error)
195 		return error;
196 
197 	rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
198 	if (!rd)
199 		return -ENOMEM;
200 	rd->sc = sc;
201 	rd->xname.name = rd->namebuf;
202 	sc->buf = rd;
203 
204 	return 0;
205 }
206 
207 /*
208  * Look up the dotdot entry and confirm that it's really the parent.
209  * Returns NULLFSINO if we don't know what to do.
210  */
211 static inline xfs_ino_t
212 xrep_dir_lookup_parent(
213 	struct xrep_dir		*rd)
214 {
215 	struct xfs_scrub	*sc = rd->sc;
216 	xfs_ino_t		ino;
217 	int			error;
218 
219 	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
220 	if (error)
221 		return NULLFSINO;
222 	if (!xfs_verify_dir_ino(sc->mp, ino))
223 		return NULLFSINO;
224 
225 	error = xrep_findparent_confirm(sc, &ino);
226 	if (error)
227 		return NULLFSINO;
228 
229 	return ino;
230 }
231 
232 /*
233  * Look up '..' in the dentry cache and confirm that it's really the parent.
234  * Returns NULLFSINO if the dcache misses or if the hit is implausible.
235  */
236 static inline xfs_ino_t
237 xrep_dir_dcache_parent(
238 	struct xrep_dir		*rd)
239 {
240 	struct xfs_scrub	*sc = rd->sc;
241 	xfs_ino_t		parent_ino;
242 	int			error;
243 
244 	parent_ino = xrep_findparent_from_dcache(sc);
245 	if (parent_ino == NULLFSINO)
246 		return parent_ino;
247 
248 	error = xrep_findparent_confirm(sc, &parent_ino);
249 	if (error)
250 		return NULLFSINO;
251 
252 	return parent_ino;
253 }
254 
255 /* Try to find the parent of the directory being repaired. */
256 STATIC int
257 xrep_dir_find_parent(
258 	struct xrep_dir		*rd)
259 {
260 	xfs_ino_t		ino;
261 
262 	ino = xrep_findparent_self_reference(rd->sc);
263 	if (ino != NULLFSINO) {
264 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
265 		return 0;
266 	}
267 
268 	ino = xrep_dir_dcache_parent(rd);
269 	if (ino != NULLFSINO) {
270 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
271 		return 0;
272 	}
273 
274 	ino = xrep_dir_lookup_parent(rd);
275 	if (ino != NULLFSINO) {
276 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
277 		return 0;
278 	}
279 
280 	/*
281 	 * A full filesystem scan is the last resort.  On a busy filesystem,
282 	 * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
283 	 * that we don't know what who the parent is, so we should return to
284 	 * userspace.
285 	 */
286 	return xrep_findparent_scan(&rd->pscan);
287 }
288 
289 /*
290  * Decide if we want to salvage this entry.  We don't bother with oversized
291  * names or the dot entry.
292  */
293 STATIC int
294 xrep_dir_want_salvage(
295 	struct xrep_dir		*rd,
296 	const char		*name,
297 	int			namelen,
298 	xfs_ino_t		ino)
299 {
300 	struct xfs_mount	*mp = rd->sc->mp;
301 
302 	/* No pointers to ourselves or to garbage. */
303 	if (ino == rd->sc->ip->i_ino)
304 		return false;
305 	if (!xfs_verify_dir_ino(mp, ino))
306 		return false;
307 
308 	/* No weird looking names or dot entries. */
309 	if (namelen >= MAXNAMELEN || namelen <= 0)
310 		return false;
311 	if (namelen == 1 && name[0] == '.')
312 		return false;
313 	if (!xfs_dir2_namecheck(name, namelen))
314 		return false;
315 
316 	return true;
317 }
318 
319 /*
320  * Remember that we want to create a dirent in the tempdir.  These stashed
321  * actions will be replayed later.
322  */
323 STATIC int
324 xrep_dir_stash_createname(
325 	struct xrep_dir		*rd,
326 	const struct xfs_name	*name,
327 	xfs_ino_t		ino)
328 {
329 	struct xrep_dirent	dirent = {
330 		.action		= XREP_DIRENT_ADD,
331 		.ino		= ino,
332 		.namelen	= name->len,
333 		.ftype		= name->type,
334 	};
335 	int			error;
336 
337 	trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
338 
339 	error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
340 	if (error)
341 		return error;
342 
343 	return xfarray_append(rd->dir_entries, &dirent);
344 }
345 
346 /*
347  * Remember that we want to remove a dirent from the tempdir.  These stashed
348  * actions will be replayed later.
349  */
350 STATIC int
351 xrep_dir_stash_removename(
352 	struct xrep_dir		*rd,
353 	const struct xfs_name	*name,
354 	xfs_ino_t		ino)
355 {
356 	struct xrep_dirent	dirent = {
357 		.action		= XREP_DIRENT_REMOVE,
358 		.ino		= ino,
359 		.namelen	= name->len,
360 		.ftype		= name->type,
361 	};
362 	int			error;
363 
364 	trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
365 
366 	error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
367 	if (error)
368 		return error;
369 
370 	return xfarray_append(rd->dir_entries, &dirent);
371 }
372 
373 /* Allocate an in-core record to hold entries while we rebuild the dir data. */
374 STATIC int
375 xrep_dir_salvage_entry(
376 	struct xrep_dir		*rd,
377 	unsigned char		*name,
378 	unsigned int		namelen,
379 	xfs_ino_t		ino)
380 {
381 	struct xfs_name		xname = {
382 		.name		= name,
383 	};
384 	struct xfs_scrub	*sc = rd->sc;
385 	struct xfs_inode	*ip;
386 	unsigned int		i = 0;
387 	int			error = 0;
388 
389 	if (xchk_should_terminate(sc, &error))
390 		return error;
391 
392 	/*
393 	 * Truncate the name to the first character that would trip namecheck.
394 	 * If we no longer have a name after that, ignore this entry.
395 	 */
396 	while (i < namelen && name[i] != 0 && name[i] != '/')
397 		i++;
398 	if (i == 0)
399 		return 0;
400 	xname.len = i;
401 
402 	/* Ignore '..' entries; we already picked the new parent. */
403 	if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
404 		trace_xrep_dir_salvaged_parent(sc->ip, ino);
405 		return 0;
406 	}
407 
408 	trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
409 
410 	/*
411 	 * Compute the ftype or dump the entry if we can't.  We don't lock the
412 	 * inode because inodes can't change type while we have a reference.
413 	 */
414 	error = xchk_iget(sc, ino, &ip);
415 	if (error)
416 		return 0;
417 
418 	xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
419 	xchk_irele(sc, ip);
420 
421 	return xrep_dir_stash_createname(rd, &xname, ino);
422 }
423 
424 /* Record a shortform directory entry for later reinsertion. */
425 STATIC int
426 xrep_dir_salvage_sf_entry(
427 	struct xrep_dir			*rd,
428 	struct xfs_dir2_sf_hdr		*sfp,
429 	struct xfs_dir2_sf_entry	*sfep)
430 {
431 	xfs_ino_t			ino;
432 
433 	ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
434 	if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
435 		return 0;
436 
437 	return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
438 }
439 
440 /* Record a regular directory entry for later reinsertion. */
441 STATIC int
442 xrep_dir_salvage_data_entry(
443 	struct xrep_dir			*rd,
444 	struct xfs_dir2_data_entry	*dep)
445 {
446 	xfs_ino_t			ino;
447 
448 	ino = be64_to_cpu(dep->inumber);
449 	if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
450 		return 0;
451 
452 	return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
453 }
454 
455 /* Try to recover block/data format directory entries. */
456 STATIC int
457 xrep_dir_recover_data(
458 	struct xrep_dir		*rd,
459 	struct xfs_buf		*bp)
460 {
461 	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
462 	unsigned int		offset;
463 	unsigned int		end;
464 	int			error = 0;
465 
466 	/*
467 	 * Loop over the data portion of the block.
468 	 * Each object is a real entry (dep) or an unused one (dup).
469 	 */
470 	offset = geo->data_entry_offset;
471 	end = min_t(unsigned int, BBTOB(bp->b_length),
472 			xfs_dir3_data_end_offset(geo, bp->b_addr));
473 
474 	while (offset < end) {
475 		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
476 		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;
477 
478 		if (xchk_should_terminate(rd->sc, &error))
479 			return error;
480 
481 		/* Skip unused entries. */
482 		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
483 			offset += be16_to_cpu(dup->length);
484 			continue;
485 		}
486 
487 		/* Don't walk off the end of the block. */
488 		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
489 		if (offset > end)
490 			break;
491 
492 		/* Ok, let's save this entry. */
493 		error = xrep_dir_salvage_data_entry(rd, dep);
494 		if (error)
495 			return error;
496 
497 	}
498 
499 	return 0;
500 }
501 
502 /* Try to recover shortform directory entries. */
503 STATIC int
504 xrep_dir_recover_sf(
505 	struct xrep_dir			*rd)
506 {
507 	struct xfs_dir2_sf_hdr		*hdr;
508 	struct xfs_dir2_sf_entry	*sfep;
509 	struct xfs_dir2_sf_entry	*next;
510 	struct xfs_ifork		*ifp;
511 	xfs_ino_t			ino;
512 	unsigned char			*end;
513 	int				error = 0;
514 
515 	ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
516 	hdr = ifp->if_data;
517 	end = (unsigned char *)ifp->if_data + ifp->if_bytes;
518 
519 	ino = xfs_dir2_sf_get_parent_ino(hdr);
520 	trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
521 
522 	sfep = xfs_dir2_sf_firstentry(hdr);
523 	while ((unsigned char *)sfep < end) {
524 		if (xchk_should_terminate(rd->sc, &error))
525 			return error;
526 
527 		next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
528 		if ((unsigned char *)next > end)
529 			break;
530 
531 		/* Ok, let's save this entry. */
532 		error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
533 		if (error)
534 			return error;
535 
536 		sfep = next;
537 	}
538 
539 	return 0;
540 }
541 
542 /*
543  * Try to figure out the format of this directory from the data fork mappings
544  * and the directory size.  If we can be reasonably sure of format, we can be
545  * more aggressive in salvaging directory entries.  On return, @magic_guess
546  * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
547  * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
548  * and 0 if we can't tell.
549  */
550 STATIC void
551 xrep_dir_guess_format(
552 	struct xrep_dir		*rd,
553 	__be32			*magic_guess)
554 {
555 	struct xfs_inode	*dp = rd->sc->ip;
556 	struct xfs_mount	*mp = rd->sc->mp;
557 	struct xfs_da_geometry	*geo = mp->m_dir_geo;
558 	xfs_fileoff_t		last;
559 	int			error;
560 
561 	ASSERT(xfs_has_crc(mp));
562 
563 	*magic_guess = 0;
564 
565 	/*
566 	 * If there's a single directory block and the directory size is
567 	 * exactly one block, this has to be a single block format directory.
568 	 */
569 	error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
570 	if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
571 	    dp->i_disk_size == geo->blksize) {
572 		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
573 		return;
574 	}
575 
576 	/*
577 	 * If the last extent before the leaf offset matches the directory
578 	 * size and the directory size is larger than 1 block, this is a
579 	 * data format directory.
580 	 */
581 	last = geo->leafblk;
582 	error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
583 	if (!error &&
584 	    XFS_FSB_TO_B(mp, last) > geo->blksize &&
585 	    XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
586 		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
587 		return;
588 	}
589 }
590 
591 /* Recover directory entries from a specific directory block. */
592 STATIC int
593 xrep_dir_recover_dirblock(
594 	struct xrep_dir		*rd,
595 	__be32			magic_guess,
596 	xfs_dablk_t		dabno)
597 {
598 	struct xfs_dir2_data_hdr *hdr;
599 	struct xfs_buf		*bp;
600 	__be32			oldmagic;
601 	int			error;
602 
603 	/*
604 	 * Try to read buffer.  We invalidate them in the next step so we don't
605 	 * bother to set a buffer type or ops.
606 	 */
607 	error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
608 			XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
609 	if (error || !bp)
610 		return error;
611 
612 	hdr = bp->b_addr;
613 	oldmagic = hdr->magic;
614 
615 	trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
616 			be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
617 
618 	/*
619 	 * If we're sure of the block's format, proceed with the salvage
620 	 * operation using the specified magic number.
621 	 */
622 	if (magic_guess) {
623 		hdr->magic = magic_guess;
624 		goto recover;
625 	}
626 
627 	/*
628 	 * If we couldn't guess what type of directory this is, then we will
629 	 * only salvage entries from directory blocks that match the magic
630 	 * number and pass verifiers.
631 	 */
632 	switch (hdr->magic) {
633 	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
634 	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
635 		if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
636 			goto out;
637 		if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
638 			goto out;
639 		break;
640 	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
641 	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
642 		if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
643 			goto out;
644 		if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
645 			goto out;
646 		break;
647 	default:
648 		goto out;
649 	}
650 
651 recover:
652 	error = xrep_dir_recover_data(rd, bp);
653 
654 out:
655 	hdr->magic = oldmagic;
656 	xfs_trans_brelse(rd->sc->tp, bp);
657 	return error;
658 }
659 
660 static inline void
661 xrep_dir_init_args(
662 	struct xrep_dir		*rd,
663 	struct xfs_inode	*dp,
664 	const struct xfs_name	*name)
665 {
666 	memset(&rd->args, 0, sizeof(struct xfs_da_args));
667 	rd->args.geo = rd->sc->mp->m_dir_geo;
668 	rd->args.whichfork = XFS_DATA_FORK;
669 	rd->args.owner = rd->sc->ip->i_ino;
670 	rd->args.trans = rd->sc->tp;
671 	rd->args.dp = dp;
672 	if (!name)
673 		return;
674 	rd->args.name = name->name;
675 	rd->args.namelen = name->len;
676 	rd->args.filetype = name->type;
677 	rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
678 }
679 
680 /* Replay a stashed createname into the temporary directory. */
681 STATIC int
682 xrep_dir_replay_createname(
683 	struct xrep_dir		*rd,
684 	const struct xfs_name	*name,
685 	xfs_ino_t		inum,
686 	xfs_extlen_t		total)
687 {
688 	struct xfs_scrub	*sc = rd->sc;
689 	struct xfs_inode	*dp = rd->sc->tempip;
690 	int			error;
691 
692 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
693 
694 	error = xfs_dir_ino_validate(sc->mp, inum);
695 	if (error)
696 		return error;
697 
698 	trace_xrep_dir_replay_createname(dp, name, inum);
699 
700 	xrep_dir_init_args(rd, dp, name);
701 	rd->args.inumber = inum;
702 	rd->args.total = total;
703 	rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
704 	return xfs_dir_createname_args(&rd->args);
705 }
706 
707 /* Replay a stashed removename onto the temporary directory. */
708 STATIC int
709 xrep_dir_replay_removename(
710 	struct xrep_dir		*rd,
711 	const struct xfs_name	*name,
712 	xfs_extlen_t		total)
713 {
714 	struct xfs_inode	*dp = rd->args.dp;
715 
716 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
717 
718 	xrep_dir_init_args(rd, dp, name);
719 	rd->args.op_flags = 0;
720 	rd->args.total = total;
721 
722 	trace_xrep_dir_replay_removename(dp, name, 0);
723 	return xfs_dir_removename_args(&rd->args);
724 }
725 
726 /*
727  * Add this stashed incore directory entry to the temporary directory.
728  * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
729  * must not be in transaction context.
730  */
731 STATIC int
732 xrep_dir_replay_update(
733 	struct xrep_dir			*rd,
734 	const struct xfs_name		*xname,
735 	const struct xrep_dirent	*dirent)
736 {
737 	struct xfs_mount		*mp = rd->sc->mp;
738 #ifdef DEBUG
739 	xfs_ino_t			ino;
740 #endif
741 	uint				resblks;
742 	int				error;
743 
744 	resblks = xfs_link_space_res(mp, xname->len);
745 	error = xchk_trans_alloc(rd->sc, resblks);
746 	if (error)
747 		return error;
748 
749 	/* Lock the temporary directory and join it to the transaction */
750 	xrep_tempfile_ilock(rd->sc);
751 	xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
752 
753 	switch (dirent->action) {
754 	case XREP_DIRENT_ADD:
755 		/*
756 		 * Create a replacement dirent in the temporary directory.
757 		 * Note that _createname doesn't check for existing entries.
758 		 * There shouldn't be any in the temporary dir, but we'll
759 		 * verify this in debug mode.
760 		 */
761 #ifdef DEBUG
762 		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
763 		if (error != -ENOENT) {
764 			ASSERT(error != -ENOENT);
765 			goto out_cancel;
766 		}
767 #endif
768 
769 		error = xrep_dir_replay_createname(rd, xname, dirent->ino,
770 				resblks);
771 		if (error)
772 			goto out_cancel;
773 
774 		if (xname->type == XFS_DIR3_FT_DIR)
775 			rd->subdirs++;
776 		rd->dirents++;
777 		break;
778 	case XREP_DIRENT_REMOVE:
779 		/*
780 		 * Remove a dirent from the temporary directory.  Note that
781 		 * _removename doesn't check the inode target of the exist
782 		 * entry.  There should be a perfect match in the temporary
783 		 * dir, but we'll verify this in debug mode.
784 		 */
785 #ifdef DEBUG
786 		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
787 		if (error) {
788 			ASSERT(error != 0);
789 			goto out_cancel;
790 		}
791 		if (ino != dirent->ino) {
792 			ASSERT(ino == dirent->ino);
793 			error = -EIO;
794 			goto out_cancel;
795 		}
796 #endif
797 
798 		error = xrep_dir_replay_removename(rd, xname, resblks);
799 		if (error)
800 			goto out_cancel;
801 
802 		if (xname->type == XFS_DIR3_FT_DIR)
803 			rd->subdirs--;
804 		rd->dirents--;
805 		break;
806 	default:
807 		ASSERT(0);
808 		error = -EIO;
809 		goto out_cancel;
810 	}
811 
812 	/* Commit and unlock. */
813 	error = xrep_trans_commit(rd->sc);
814 	if (error)
815 		return error;
816 
817 	xrep_tempfile_iunlock(rd->sc);
818 	return 0;
819 out_cancel:
820 	xchk_trans_cancel(rd->sc);
821 	xrep_tempfile_iunlock(rd->sc);
822 	return error;
823 }
824 
/*
 * Flush stashed incore dirent updates that have been recorded by the scanner.
 * This is done to reduce the memory requirements of the directory rebuild,
 * since directories can contain up to 32GB of directory data.
 *
 * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
 * IOLOCK.
 *
 * Every record in dir_entries is loaded together with its name and replayed
 * into the tempdir; when all of them have been applied, both stores are
 * emptied.  Returns 0 or the first negative errno encountered, in which case
 * the stores are left as they were.
 */
STATIC int
xrep_dir_replay_updates(
	struct xrep_dir		*rd)
{
	xfarray_idx_t		array_cur;
	int			error;

	/* Add all the salvaged dirents to the temporary directory. */
	mutex_lock(&rd->pscan.lock);
	foreach_xfarray_idx(rd->dir_entries, array_cur) {
		struct xrep_dirent	dirent;

		error = xfarray_load(rd->dir_entries, array_cur, &dirent);
		if (error)
			goto out_unlock;

		/* The name is loaded into rd->xname. */
		error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
				&rd->xname, dirent.namelen);
		if (error)
			goto out_unlock;
		rd->xname.type = dirent.ftype;

		/*
		 * The stash lock is only held while reading the stores; it is
		 * dropped for the duration of the directory update itself.
		 */
		mutex_unlock(&rd->pscan.lock);

		error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
		if (error)
			return error;	/* lock already dropped */
		mutex_lock(&rd->pscan.lock);
	}

	/* Empty out both arrays now that we've added the entries. */
	xfarray_truncate(rd->dir_entries);
	xfblob_truncate(rd->dir_names);
	mutex_unlock(&rd->pscan.lock);
	return 0;
out_unlock:
	mutex_unlock(&rd->pscan.lock);
	return error;
}
871 
/*
 * Periodically flush stashed directory entries to the temporary dir.  This
 * is done to reduce the memory requirements of the directory rebuild, since
 * directories can contain up to 32GB of directory data.
 *
 * On success the caller gets back a fresh scrub transaction and holds
 * ILOCK_EXCL on the directory being repaired again.  On failure a negative
 * errno is returned and neither is re-established.
 */
STATIC int
xrep_dir_flush_stashed(
	struct xrep_dir		*rd)
{
	int			error;

	/*
	 * Entering this function, the scrub context has a reference to the
	 * inode being repaired, the temporary file, and a scrub transaction
	 * that we use during dirent salvaging to avoid livelocking if there
	 * are cycles in the directory structures.  We hold ILOCK_EXCL on both
	 * the inode being repaired and the temporary file, though they are
	 * not ijoined to the scrub transaction.
	 *
	 * To constrain kernel memory use, we occasionally write salvaged
	 * dirents from the xfarray and xfblob structures into the temporary
	 * directory in preparation for exchanging the directory structures at
	 * the end.  Updating the temporary file requires a transaction, so we
	 * commit the scrub transaction and drop the two ILOCKs so that
	 * we can allocate whatever transaction we want.
	 *
	 * We still hold IOLOCK_EXCL on the inode being repaired, which
	 * prevents anyone from accessing the damaged directory data while we
	 * repair it.
	 *
	 * NOTE(review): the text above speaks of two ILOCKs, but the code
	 * below drops only the ILOCK of the inode being repaired, and the
	 * replay code takes the temporary file's ILOCK itself -- confirm
	 * which is current.
	 */
	error = xrep_trans_commit(rd->sc);
	if (error)
		return error;
	xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);

	/*
	 * Take the IOLOCK of the temporary file while we modify dirents.  This
	 * isn't strictly required because the temporary file is never revealed
	 * to userspace, but we follow the same locking rules.  We still hold
	 * sc->ip's IOLOCK.
	 */
	error = xrep_tempfile_iolock_polled(rd->sc);
	if (error)
		return error;

	/*
	 * Write to the tempdir all the updates that we've stashed.  The
	 * tempfile IOLOCK is dropped again whether or not the replay worked.
	 */
	error = xrep_dir_replay_updates(rd);
	xrep_tempfile_iounlock(rd->sc);
	if (error)
		return error;

	/*
	 * Recreate the salvage transaction and relock the dir we're salvaging.
	 */
	error = xchk_trans_alloc(rd->sc, 0);
	if (error)
		return error;
	xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
	return 0;
}
932 
933 /* Decide if we've stashed too much dirent data in memory. */
934 static inline bool
935 xrep_dir_want_flush_stashed(
936 	struct xrep_dir		*rd)
937 {
938 	unsigned long long	bytes;
939 
940 	bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
941 	return bytes > XREP_DIR_MAX_STASH_BYTES;
942 }
943 
944 /* Extract as many directory entries as we can. */
945 STATIC int
946 xrep_dir_recover(
947 	struct xrep_dir		*rd)
948 {
949 	struct xfs_bmbt_irec	got;
950 	struct xfs_scrub	*sc = rd->sc;
951 	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
952 	xfs_fileoff_t		offset;
953 	xfs_dablk_t		dabno;
954 	__be32			magic_guess;
955 	int			nmap;
956 	int			error;
957 
958 	xrep_dir_guess_format(rd, &magic_guess);
959 
960 	/* Iterate each directory data block in the data fork. */
961 	for (offset = 0;
962 	     offset < geo->leafblk;
963 	     offset = got.br_startoff + got.br_blockcount) {
964 		nmap = 1;
965 		error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
966 				&got, &nmap, 0);
967 		if (error)
968 			return error;
969 		if (nmap != 1)
970 			return -EFSCORRUPTED;
971 		if (!xfs_bmap_is_written_extent(&got))
972 			continue;
973 
974 		for (dabno = round_up(got.br_startoff, geo->fsbcount);
975 		     dabno < got.br_startoff + got.br_blockcount;
976 		     dabno += geo->fsbcount) {
977 			if (xchk_should_terminate(rd->sc, &error))
978 				return error;
979 
980 			error = xrep_dir_recover_dirblock(rd,
981 					magic_guess, dabno);
982 			if (error)
983 				return error;
984 
985 			/* Flush dirents to constrain memory usage. */
986 			if (xrep_dir_want_flush_stashed(rd)) {
987 				error = xrep_dir_flush_stashed(rd);
988 				if (error)
989 					return error;
990 			}
991 		}
992 	}
993 
994 	return 0;
995 }
996 
997 /*
998  * Find all the directory entries for this inode by scraping them out of the
999  * directory leaf blocks by hand, and flushing them into the temp dir.
1000  */
1001 STATIC int
1002 xrep_dir_find_entries(
1003 	struct xrep_dir		*rd)
1004 {
1005 	struct xfs_inode	*dp = rd->sc->ip;
1006 	int			error;
1007 
1008 	/*
1009 	 * Salvage directory entries from the old directory, and write them to
1010 	 * the temporary directory.
1011 	 */
1012 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
1013 		error = xrep_dir_recover_sf(rd);
1014 	} else {
1015 		error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
1016 		if (error)
1017 			return error;
1018 
1019 		error = xrep_dir_recover(rd);
1020 	}
1021 	if (error)
1022 		return error;
1023 
1024 	return xrep_dir_flush_stashed(rd);
1025 }
1026 
1027 /* Scan all files in the filesystem for dirents. */
1028 STATIC int
1029 xrep_dir_salvage_entries(
1030 	struct xrep_dir		*rd)
1031 {
1032 	struct xfs_scrub	*sc = rd->sc;
1033 	int			error;
1034 
1035 	/*
1036 	 * Drop the ILOCK on this directory so that we can scan for this
1037 	 * directory's parent.  Figure out who is going to be the parent of
1038 	 * this directory, then retake the ILOCK so that we can salvage
1039 	 * directory entries.
1040 	 */
1041 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1042 	error = xrep_dir_find_parent(rd);
1043 	xchk_ilock(sc, XFS_ILOCK_EXCL);
1044 	if (error)
1045 		return error;
1046 
1047 	/*
1048 	 * Collect directory entries by parsing raw leaf blocks to salvage
1049 	 * whatever we can.  When we're done, free the staging memory before
1050 	 * exchanging the directories to reduce memory usage.
1051 	 */
1052 	error = xrep_dir_find_entries(rd);
1053 	if (error)
1054 		return error;
1055 
1056 	/*
1057 	 * Cancel the repair transaction and drop the ILOCK so that we can
1058 	 * (later) use the atomic mapping exchange functions to compute the
1059 	 * correct block reservations and re-lock the inodes.
1060 	 *
1061 	 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
1062 	 * modifications, but there's nothing to prevent userspace from reading
1063 	 * the directory until we're ready for the exchange operation.  Reads
1064 	 * will return -EIO without shutting down the fs, so we're ok with
1065 	 * that.
1066 	 *
1067 	 * The VFS can change dotdot on us, but the findparent scan will keep
1068 	 * our incore parent inode up to date.  See the note on locking issues
1069 	 * for more details.
1070 	 */
1071 	error = xrep_trans_commit(sc);
1072 	if (error)
1073 		return error;
1074 
1075 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1076 	return 0;
1077 }
1078 
1079 
1080 /*
1081  * Examine a parent pointer of a file.  If it leads us back to the directory
1082  * that we're rebuilding, create an incore dirent from the parent pointer and
1083  * stash it.
1084  */
1085 STATIC int
1086 xrep_dir_scan_pptr(
1087 	struct xfs_scrub		*sc,
1088 	struct xfs_inode		*ip,
1089 	unsigned int			attr_flags,
1090 	const unsigned char		*name,
1091 	unsigned int			namelen,
1092 	const void			*value,
1093 	unsigned int			valuelen,
1094 	void				*priv)
1095 {
1096 	struct xfs_name			xname = {
1097 		.name			= name,
1098 		.len			= namelen,
1099 		.type			= xfs_mode_to_ftype(VFS_I(ip)->i_mode),
1100 	};
1101 	xfs_ino_t			parent_ino;
1102 	uint32_t			parent_gen;
1103 	struct xrep_dir			*rd = priv;
1104 	int				error;
1105 
1106 	if (!(attr_flags & XFS_ATTR_PARENT))
1107 		return 0;
1108 
1109 	/*
1110 	 * Ignore parent pointers that point back to a different dir, list the
1111 	 * wrong generation number, or are invalid.
1112 	 */
1113 	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
1114 			valuelen, &parent_ino, &parent_gen);
1115 	if (error)
1116 		return error;
1117 
1118 	if (parent_ino != sc->ip->i_ino ||
1119 	    parent_gen != VFS_I(sc->ip)->i_generation)
1120 		return 0;
1121 
1122 	mutex_lock(&rd->pscan.lock);
1123 	error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
1124 	mutex_unlock(&rd->pscan.lock);
1125 	return error;
1126 }
1127 
1128 /*
1129  * If this child dirent points to the directory being repaired, remember that
1130  * fact so that we can reset the dotdot entry if necessary.
1131  */
1132 STATIC int
1133 xrep_dir_scan_dirent(
1134 	struct xfs_scrub	*sc,
1135 	struct xfs_inode	*dp,
1136 	xfs_dir2_dataptr_t	dapos,
1137 	const struct xfs_name	*name,
1138 	xfs_ino_t		ino,
1139 	void			*priv)
1140 {
1141 	struct xrep_dir		*rd = priv;
1142 
1143 	/* Dirent doesn't point to this directory. */
1144 	if (ino != rd->sc->ip->i_ino)
1145 		return 0;
1146 
1147 	/* Ignore garbage inum. */
1148 	if (!xfs_verify_dir_ino(rd->sc->mp, ino))
1149 		return 0;
1150 
1151 	/* No weird looking names. */
1152 	if (name->len >= MAXNAMELEN || name->len <= 0)
1153 		return 0;
1154 
1155 	/* Don't pick up dot or dotdot entries; we only want child dirents. */
1156 	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
1157 	    xfs_dir2_samename(name, &xfs_name_dot))
1158 		return 0;
1159 
1160 	trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
1161 			dp->i_ino);
1162 
1163 	xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
1164 	return 0;
1165 }
1166 
1167 /*
1168  * Decide if we want to look for child dirents or parent pointers in this file.
1169  * Skip the dir being repaired and any files being used to stage repairs.
1170  */
1171 static inline bool
1172 xrep_dir_want_scan(
1173 	struct xrep_dir		*rd,
1174 	const struct xfs_inode	*ip)
1175 {
1176 	return ip != rd->sc->ip && !xrep_is_tempfile(ip);
1177 }
1178 
1179 /*
1180  * Take ILOCK on a file that we want to scan.
1181  *
1182  * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
1183  * has an unloaded attr bmbt.  Otherwise, take ILOCK_SHARED.
1184  */
1185 static inline unsigned int
1186 xrep_dir_scan_ilock(
1187 	struct xrep_dir		*rd,
1188 	struct xfs_inode	*ip)
1189 {
1190 	uint			lock_mode = XFS_ILOCK_SHARED;
1191 
1192 	/* Need to take the shared ILOCK to advance the iscan cursor. */
1193 	if (!xrep_dir_want_scan(rd, ip))
1194 		goto lock;
1195 
1196 	if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
1197 		lock_mode = XFS_ILOCK_EXCL;
1198 		goto lock;
1199 	}
1200 
1201 	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
1202 		lock_mode = XFS_ILOCK_EXCL;
1203 
1204 lock:
1205 	xfs_ilock(ip, lock_mode);
1206 	return lock_mode;
1207 }
1208 
1209 /*
1210  * Scan this file for relevant child dirents or parent pointers that point to
1211  * the directory we're rebuilding.
1212  */
1213 STATIC int
1214 xrep_dir_scan_file(
1215 	struct xrep_dir		*rd,
1216 	struct xfs_inode	*ip)
1217 {
1218 	unsigned int		lock_mode;
1219 	int			error = 0;
1220 
1221 	lock_mode = xrep_dir_scan_ilock(rd, ip);
1222 
1223 	if (!xrep_dir_want_scan(rd, ip))
1224 		goto scan_done;
1225 
1226 	/*
1227 	 * If the extended attributes look as though they has been zapped by
1228 	 * the inode record repair code, we cannot scan for parent pointers.
1229 	 */
1230 	if (xchk_pptr_looks_zapped(ip)) {
1231 		error = -EBUSY;
1232 		goto scan_done;
1233 	}
1234 
1235 	error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
1236 	if (error)
1237 		goto scan_done;
1238 
1239 	if (S_ISDIR(VFS_I(ip)->i_mode)) {
1240 		/*
1241 		 * If the directory looks as though it has been zapped by the
1242 		 * inode record repair code, we cannot scan for child dirents.
1243 		 */
1244 		if (xchk_dir_looks_zapped(ip)) {
1245 			error = -EBUSY;
1246 			goto scan_done;
1247 		}
1248 
1249 		error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
1250 		if (error)
1251 			goto scan_done;
1252 	}
1253 
1254 scan_done:
1255 	xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
1256 	xfs_iunlock(ip, lock_mode);
1257 	return error;
1258 }
1259 
1260 /*
1261  * Scan all files in the filesystem for parent pointers that we can turn into
1262  * replacement dirents, and a dirent that we can use to set the dotdot pointer.
1263  */
1264 STATIC int
1265 xrep_dir_scan_dirtree(
1266 	struct xrep_dir		*rd)
1267 {
1268 	struct xfs_scrub	*sc = rd->sc;
1269 	struct xfs_inode	*ip;
1270 	int			error;
1271 
1272 	/* Roots of directory trees are their own parents. */
1273 	if (sc->ip == sc->mp->m_rootip)
1274 		xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
1275 
1276 	/*
1277 	 * Filesystem scans are time consuming.  Drop the directory ILOCK and
1278 	 * all other resources for the duration of the scan and hope for the
1279 	 * best.  The live update hooks will keep our scan information up to
1280 	 * date even though we've dropped the locks.
1281 	 */
1282 	xchk_trans_cancel(sc);
1283 	if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
1284 		xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
1285 						    XFS_ILOCK_EXCL));
1286 	error = xchk_trans_alloc_empty(sc);
1287 	if (error)
1288 		return error;
1289 
1290 	while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
1291 		bool		flush;
1292 
1293 		error = xrep_dir_scan_file(rd, ip);
1294 		xchk_irele(sc, ip);
1295 		if (error)
1296 			break;
1297 
1298 		/* Flush stashed dirent updates to constrain memory usage. */
1299 		mutex_lock(&rd->pscan.lock);
1300 		flush = xrep_dir_want_flush_stashed(rd);
1301 		mutex_unlock(&rd->pscan.lock);
1302 		if (flush) {
1303 			xchk_trans_cancel(sc);
1304 
1305 			error = xrep_tempfile_iolock_polled(sc);
1306 			if (error)
1307 				break;
1308 
1309 			error = xrep_dir_replay_updates(rd);
1310 			xrep_tempfile_iounlock(sc);
1311 			if (error)
1312 				break;
1313 
1314 			error = xchk_trans_alloc_empty(sc);
1315 			if (error)
1316 				break;
1317 		}
1318 
1319 		if (xchk_should_terminate(sc, &error))
1320 			break;
1321 	}
1322 	xchk_iscan_iter_finish(&rd->pscan.iscan);
1323 	if (error) {
1324 		/*
1325 		 * If we couldn't grab an inode that was busy with a state
1326 		 * change, change the error code so that we exit to userspace
1327 		 * as quickly as possible.
1328 		 */
1329 		if (error == -EBUSY)
1330 			return -ECANCELED;
1331 		return error;
1332 	}
1333 
1334 	/*
1335 	 * Cancel the empty transaction so that we can (later) use the atomic
1336 	 * file mapping exchange functions to lock files and commit the new
1337 	 * directory.
1338 	 */
1339 	xchk_trans_cancel(rd->sc);
1340 	return 0;
1341 }
1342 
1343 /*
1344  * Capture dirent updates being made by other threads which are relevant to the
1345  * directory being repaired.
1346  */
1347 STATIC int
1348 xrep_dir_live_update(
1349 	struct notifier_block		*nb,
1350 	unsigned long			action,
1351 	void				*data)
1352 {
1353 	struct xfs_dir_update_params	*p = data;
1354 	struct xrep_dir			*rd;
1355 	struct xfs_scrub		*sc;
1356 	int				error = 0;
1357 
1358 	rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
1359 	sc = rd->sc;
1360 
1361 	/*
1362 	 * This thread updated a child dirent in the directory that we're
1363 	 * rebuilding.  Stash the update for replay against the temporary
1364 	 * directory.
1365 	 */
1366 	if (p->dp->i_ino == sc->ip->i_ino &&
1367 	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
1368 		mutex_lock(&rd->pscan.lock);
1369 		if (p->delta > 0)
1370 			error = xrep_dir_stash_createname(rd, p->name,
1371 					p->ip->i_ino);
1372 		else
1373 			error = xrep_dir_stash_removename(rd, p->name,
1374 					p->ip->i_ino);
1375 		mutex_unlock(&rd->pscan.lock);
1376 		if (error)
1377 			goto out_abort;
1378 	}
1379 
1380 	/*
1381 	 * This thread updated another directory's child dirent that points to
1382 	 * the directory that we're rebuilding, so remember the new dotdot
1383 	 * target.
1384 	 */
1385 	if (p->ip->i_ino == sc->ip->i_ino &&
1386 	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
1387 		if (p->delta > 0) {
1388 			trace_xrep_dir_stash_createname(sc->tempip,
1389 					&xfs_name_dotdot,
1390 					p->dp->i_ino);
1391 
1392 			xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
1393 		} else {
1394 			trace_xrep_dir_stash_removename(sc->tempip,
1395 					&xfs_name_dotdot,
1396 					rd->pscan.parent_ino);
1397 
1398 			xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
1399 		}
1400 	}
1401 
1402 	return NOTIFY_DONE;
1403 out_abort:
1404 	xchk_iscan_abort(&rd->pscan.iscan);
1405 	return NOTIFY_DONE;
1406 }
1407 
1408 /*
1409  * Free all the directory blocks and reset the data fork.  The caller must
1410  * join the inode to the transaction.  This function returns with the inode
1411  * joined to a clean scrub transaction.
1412  */
1413 STATIC int
1414 xrep_dir_reset_fork(
1415 	struct xrep_dir		*rd,
1416 	xfs_ino_t		parent_ino)
1417 {
1418 	struct xfs_scrub	*sc = rd->sc;
1419 	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1420 	int			error;
1421 
1422 	/* Unmap all the directory buffers. */
1423 	if (xfs_ifork_has_extents(ifp)) {
1424 		error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1425 		if (error)
1426 			return error;
1427 	}
1428 
1429 	trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1430 
1431 	/* Reset the data fork to an empty data fork. */
1432 	xfs_idestroy_fork(ifp);
1433 	ifp->if_bytes = 0;
1434 	sc->tempip->i_disk_size = 0;
1435 
1436 	/* Reinitialize the short form directory. */
1437 	xrep_dir_init_args(rd, sc->tempip, NULL);
1438 	return xfs_dir2_sf_create(&rd->args, parent_ino);
1439 }
1440 
1441 /*
1442  * Prepare both inodes' directory forks for exchanging mappings.  Promote the
1443  * tempfile from short format to leaf format, and if the file being repaired
1444  * has a short format data fork, turn it into an empty extent list.
1445  */
1446 STATIC int
1447 xrep_dir_swap_prep(
1448 	struct xfs_scrub	*sc,
1449 	bool			temp_local,
1450 	bool			ip_local)
1451 {
1452 	int			error;
1453 
1454 	/*
1455 	 * If the tempfile's directory is in shortform format, convert that to
1456 	 * a single leaf extent so that we can use the atomic mapping exchange.
1457 	 */
1458 	if (temp_local) {
1459 		struct xfs_da_args	args = {
1460 			.dp		= sc->tempip,
1461 			.geo		= sc->mp->m_dir_geo,
1462 			.whichfork	= XFS_DATA_FORK,
1463 			.trans		= sc->tp,
1464 			.total		= 1,
1465 			.owner		= sc->ip->i_ino,
1466 		};
1467 
1468 		error = xfs_dir2_sf_to_block(&args);
1469 		if (error)
1470 			return error;
1471 
1472 		/*
1473 		 * Roll the deferred log items to get us back to a clean
1474 		 * transaction.
1475 		 */
1476 		error = xfs_defer_finish(&sc->tp);
1477 		if (error)
1478 			return error;
1479 	}
1480 
1481 	/*
1482 	 * If the file being repaired had a shortform data fork, convert that
1483 	 * to an empty extent list in preparation for the atomic mapping
1484 	 * exchange.
1485 	 */
1486 	if (ip_local) {
1487 		struct xfs_ifork	*ifp;
1488 
1489 		ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1490 		xfs_idestroy_fork(ifp);
1491 		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1492 		ifp->if_nextents = 0;
1493 		ifp->if_bytes = 0;
1494 		ifp->if_data = NULL;
1495 		ifp->if_height = 0;
1496 
1497 		xfs_trans_log_inode(sc->tp, sc->ip,
1498 				XFS_ILOG_CORE | XFS_ILOG_DDATA);
1499 	}
1500 
1501 	return 0;
1502 }
1503 
1504 /*
1505  * Replace the inode number of a directory entry.
1506  */
1507 static int
1508 xrep_dir_replace(
1509 	struct xrep_dir		*rd,
1510 	struct xfs_inode	*dp,
1511 	const struct xfs_name	*name,
1512 	xfs_ino_t		inum,
1513 	xfs_extlen_t		total)
1514 {
1515 	struct xfs_scrub	*sc = rd->sc;
1516 	int			error;
1517 
1518 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1519 
1520 	error = xfs_dir_ino_validate(sc->mp, inum);
1521 	if (error)
1522 		return error;
1523 
1524 	xrep_dir_init_args(rd, dp, name);
1525 	rd->args.inumber = inum;
1526 	rd->args.total = total;
1527 	return xfs_dir_replace_args(&rd->args);
1528 }
1529 
1530 /*
1531  * Reset the link count of this directory and adjust the unlinked list pointers
1532  * as needed.
1533  */
1534 STATIC int
1535 xrep_dir_set_nlink(
1536 	struct xrep_dir		*rd)
1537 {
1538 	struct xfs_scrub	*sc = rd->sc;
1539 	struct xfs_inode	*dp = sc->ip;
1540 	struct xfs_perag	*pag;
1541 	unsigned int		new_nlink = min_t(unsigned long long,
1542 						  rd->subdirs + 2,
1543 						  XFS_NLINK_PINNED);
1544 	int			error;
1545 
1546 	/*
1547 	 * The directory is not on the incore unlinked list, which means that
1548 	 * it needs to be reachable via the directory tree.  Update the nlink
1549 	 * with our observed link count.  If the directory has no parent, it
1550 	 * will be moved to the orphanage.
1551 	 */
1552 	if (!xfs_inode_on_unlinked_list(dp))
1553 		goto reset_nlink;
1554 
1555 	/*
1556 	 * The directory is on the unlinked list and we did not find any
1557 	 * dirents.  Set the link count to zero and let the directory
1558 	 * inactivate when the last reference drops.
1559 	 */
1560 	if (rd->dirents == 0) {
1561 		rd->needs_adoption = false;
1562 		new_nlink = 0;
1563 		goto reset_nlink;
1564 	}
1565 
1566 	/*
1567 	 * The directory is on the unlinked list and we found dirents.  This
1568 	 * directory needs to be reachable via the directory tree.  Remove the
1569 	 * dir from the unlinked list and update nlink with the observed link
1570 	 * count.  If the directory has no parent, it will be moved to the
1571 	 * orphanage.
1572 	 */
1573 	pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1574 	if (!pag) {
1575 		ASSERT(0);
1576 		return -EFSCORRUPTED;
1577 	}
1578 
1579 	error = xfs_iunlink_remove(sc->tp, pag, dp);
1580 	xfs_perag_put(pag);
1581 	if (error)
1582 		return error;
1583 
1584 reset_nlink:
1585 	if (VFS_I(dp)->i_nlink != new_nlink)
1586 		set_nlink(VFS_I(dp), new_nlink);
1587 	return 0;
1588 }
1589 
1590 /*
1591  * Finish replaying stashed dirent updates, allocate a transaction for
1592  * exchanging data fork mappings, and take the ILOCKs of both directories
1593  * before we commit the new directory structure.
1594  */
1595 STATIC int
1596 xrep_dir_finalize_tempdir(
1597 	struct xrep_dir		*rd)
1598 {
1599 	struct xfs_scrub	*sc = rd->sc;
1600 	int			error;
1601 
1602 	if (!xfs_has_parent(sc->mp))
1603 		return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1604 
1605 	/*
1606 	 * Repair relies on the ILOCK to quiesce all possible dirent updates.
1607 	 * Replay all queued dirent updates into the tempdir before exchanging
1608 	 * the contents, even if that means dropping the ILOCKs and the
1609 	 * transaction.
1610 	 */
1611 	do {
1612 		error = xrep_dir_replay_updates(rd);
1613 		if (error)
1614 			return error;
1615 
1616 		error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1617 		if (error)
1618 			return error;
1619 
1620 		if (xfarray_length(rd->dir_entries) == 0)
1621 			break;
1622 
1623 		xchk_trans_cancel(sc);
1624 		xrep_tempfile_iunlock_both(sc);
1625 	} while (!xchk_should_terminate(sc, &error));
1626 	return error;
1627 }
1628 
1629 /* Exchange the temporary directory's data fork with the one being repaired. */
1630 STATIC int
1631 xrep_dir_swap(
1632 	struct xrep_dir		*rd)
1633 {
1634 	struct xfs_scrub	*sc = rd->sc;
1635 	bool			ip_local, temp_local;
1636 	int			error = 0;
1637 
1638 	/*
1639 	 * If we never found the parent for this directory, temporarily assign
1640 	 * the root dir as the parent; we'll move this to the orphanage after
1641 	 * exchanging the dir contents.  We hold the ILOCK of the dir being
1642 	 * repaired, so we're not worried about racy updates of dotdot.
1643 	 */
1644 	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1645 	if (rd->pscan.parent_ino == NULLFSINO) {
1646 		rd->needs_adoption = true;
1647 		rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1648 	}
1649 
1650 	/*
1651 	 * Reset the temporary directory's '..' entry to point to the parent
1652 	 * that we found.  The temporary directory was created with the root
1653 	 * directory as the parent, so we can skip this if repairing a
1654 	 * subdirectory of the root.
1655 	 *
1656 	 * It's also possible that this replacement could also expand a sf
1657 	 * tempdir into block format.
1658 	 */
1659 	if (rd->pscan.parent_ino != sc->mp->m_rootip->i_ino) {
1660 		error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1661 				rd->pscan.parent_ino, rd->tx.req.resblks);
1662 		if (error)
1663 			return error;
1664 	}
1665 
1666 	/*
1667 	 * Changing the dot and dotdot entries could have changed the shape of
1668 	 * the directory, so we recompute these.
1669 	 */
1670 	ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1671 	temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1672 
1673 	/*
1674 	 * If the both files have a local format data fork and the rebuilt
1675 	 * directory data would fit in the repaired file's data fork, copy
1676 	 * the contents from the tempfile and update the directory link count.
1677 	 * We're done now.
1678 	 */
1679 	if (ip_local && temp_local &&
1680 	    sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1681 		xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1682 		return xrep_dir_set_nlink(rd);
1683 	}
1684 
1685 	/*
1686 	 * Clean the transaction before we start working on exchanging
1687 	 * directory contents.
1688 	 */
1689 	error = xrep_tempfile_roll_trans(rd->sc);
1690 	if (error)
1691 		return error;
1692 
1693 	/* Otherwise, make sure both data forks are in block-mapping mode. */
1694 	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1695 	if (error)
1696 		return error;
1697 
1698 	/*
1699 	 * Set nlink of the directory in the same transaction sequence that
1700 	 * (atomically) commits the new directory data.
1701 	 */
1702 	error = xrep_dir_set_nlink(rd);
1703 	if (error)
1704 		return error;
1705 
1706 	return xrep_tempexch_contents(sc, &rd->tx);
1707 }
1708 
1709 /*
1710  * Exchange the new directory contents (which we created in the tempfile) with
1711  * the directory being repaired.
1712  */
1713 STATIC int
1714 xrep_dir_rebuild_tree(
1715 	struct xrep_dir		*rd)
1716 {
1717 	struct xfs_scrub	*sc = rd->sc;
1718 	int			error;
1719 
1720 	trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1721 
1722 	/*
1723 	 * Take the IOLOCK on the temporary file so that we can run dir
1724 	 * operations with the same locks held as we would for a normal file.
1725 	 * We still hold sc->ip's IOLOCK.
1726 	 */
1727 	error = xrep_tempfile_iolock_polled(rd->sc);
1728 	if (error)
1729 		return error;
1730 
1731 	/*
1732 	 * Allocate transaction, lock inodes, and make sure that we've replayed
1733 	 * all the stashed dirent updates to the tempdir.  After this point,
1734 	 * we're ready to exchange data fork mappings.
1735 	 */
1736 	error = xrep_dir_finalize_tempdir(rd);
1737 	if (error)
1738 		return error;
1739 
1740 	if (xchk_iscan_aborted(&rd->pscan.iscan))
1741 		return -ECANCELED;
1742 
1743 	/*
1744 	 * Exchange the tempdir's data fork with the file being repaired.  This
1745 	 * recreates the transaction and re-takes the ILOCK in the scrub
1746 	 * context.
1747 	 */
1748 	error = xrep_dir_swap(rd);
1749 	if (error)
1750 		return error;
1751 
1752 	/*
1753 	 * Release the old directory blocks and reset the data fork of the temp
1754 	 * directory to an empty shortform directory because inactivation does
1755 	 * nothing for directories.
1756 	 */
1757 	error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1758 	if (error)
1759 		return error;
1760 
1761 	/*
1762 	 * Roll to get a transaction without any inodes joined to it.  Then we
1763 	 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1764 	 * the scrub target directory.
1765 	 */
1766 	error = xfs_trans_roll(&sc->tp);
1767 	if (error)
1768 		return error;
1769 
1770 	xrep_tempfile_iunlock(sc);
1771 	xrep_tempfile_iounlock(sc);
1772 	return 0;
1773 }
1774 
1775 /* Set up the filesystem scan so we can regenerate directory entries. */
1776 STATIC int
1777 xrep_dir_setup_scan(
1778 	struct xrep_dir		*rd)
1779 {
1780 	struct xfs_scrub	*sc = rd->sc;
1781 	char			*descr;
1782 	int			error;
1783 
1784 	/* Set up some staging memory for salvaging dirents. */
1785 	descr = xchk_xfile_ino_descr(sc, "directory entries");
1786 	error = xfarray_create(descr, 0, sizeof(struct xrep_dirent),
1787 			&rd->dir_entries);
1788 	kfree(descr);
1789 	if (error)
1790 		return error;
1791 
1792 	descr = xchk_xfile_ino_descr(sc, "directory entry names");
1793 	error = xfblob_create(descr, &rd->dir_names);
1794 	kfree(descr);
1795 	if (error)
1796 		goto out_xfarray;
1797 
1798 	if (xfs_has_parent(sc->mp))
1799 		error = __xrep_findparent_scan_start(sc, &rd->pscan,
1800 				xrep_dir_live_update);
1801 	else
1802 		error = xrep_findparent_scan_start(sc, &rd->pscan);
1803 	if (error)
1804 		goto out_xfblob;
1805 
1806 	return 0;
1807 
1808 out_xfblob:
1809 	xfblob_destroy(rd->dir_names);
1810 	rd->dir_names = NULL;
1811 out_xfarray:
1812 	xfarray_destroy(rd->dir_entries);
1813 	rd->dir_entries = NULL;
1814 	return error;
1815 }
1816 
1817 /*
1818  * Move the current file to the orphanage.
1819  *
1820  * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks.  Upon
1821  * successful return, the scrub transaction will have enough extra reservation
1822  * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
1823  * orphanage; and both inodes will be ijoined.
1824  */
1825 STATIC int
1826 xrep_dir_move_to_orphanage(
1827 	struct xrep_dir		*rd)
1828 {
1829 	struct xfs_scrub	*sc = rd->sc;
1830 	xfs_ino_t		orig_parent, new_parent;
1831 	int			error;
1832 
1833 	/*
1834 	 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
1835 	 * prepare for the adoption.  Therefore, look up the old dotdot entry
1836 	 * for sc->ip so that we can compare it after we re-lock sc->ip.
1837 	 */
1838 	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
1839 	if (error)
1840 		return error;
1841 
1842 	/*
1843 	 * Drop the ILOCK on the scrub target and commit the transaction.
1844 	 * Adoption computes its own resource requirements and gathers the
1845 	 * necessary components.
1846 	 */
1847 	error = xrep_trans_commit(sc);
1848 	if (error)
1849 		return error;
1850 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1851 
1852 	/* If we can take the orphanage's iolock then we're ready to move. */
1853 	if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
1854 		xchk_iunlock(sc, sc->ilock_flags);
1855 		error = xrep_orphanage_iolock_two(sc);
1856 		if (error)
1857 			return error;
1858 	}
1859 
1860 	/* Grab transaction and ILOCK the two files. */
1861 	error = xrep_adoption_trans_alloc(sc, &rd->adoption);
1862 	if (error)
1863 		return error;
1864 
1865 	error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
1866 	if (error)
1867 		return error;
1868 
1869 	/*
1870 	 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
1871 	 * entry again.  If the parent changed or the child was unlinked while
1872 	 * the child directory was unlocked, we don't need to move the child to
1873 	 * the orphanage after all.
1874 	 */
1875 	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
1876 	if (error)
1877 		return error;
1878 
1879 	/*
1880 	 * Attach to the orphanage if we still have a linked directory and it
1881 	 * hasn't been moved.
1882 	 */
1883 	if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
1884 		error = xrep_adoption_move(&rd->adoption);
1885 		if (error)
1886 			return error;
1887 	}
1888 
1889 	/*
1890 	 * Launder the scrub transaction so we can drop the orphanage ILOCK
1891 	 * and IOLOCK.  Return holding the scrub target's ILOCK and IOLOCK.
1892 	 */
1893 	error = xrep_adoption_trans_roll(&rd->adoption);
1894 	if (error)
1895 		return error;
1896 
1897 	xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
1898 	xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
1899 	return 0;
1900 }
1901 
1902 /*
1903  * Repair the directory metadata.
1904  *
1905  * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
1906  * cache in XFS can't handle aliased multiblock buffers, so this might
1907  * misbehave if the directory blocks are crosslinked with other filesystem
1908  * metadata.
1909  *
1910  * XXX: Is it necessary to check the dcache for this directory to make sure
1911  * that we always recreate every cached entry?
1912  */
1913 int
1914 xrep_directory(
1915 	struct xfs_scrub	*sc)
1916 {
1917 	struct xrep_dir		*rd = sc->buf;
1918 	int			error;
1919 
1920 	/* The rmapbt is required to reap the old data fork. */
1921 	if (!xfs_has_rmapbt(sc->mp))
1922 		return -EOPNOTSUPP;
1923 	/* We require atomic file exchange range to rebuild anything. */
1924 	if (!xfs_has_exchange_range(sc->mp))
1925 		return -EOPNOTSUPP;
1926 
1927 	error = xrep_dir_setup_scan(rd);
1928 	if (error)
1929 		return error;
1930 
1931 	if (xfs_has_parent(sc->mp))
1932 		error = xrep_dir_scan_dirtree(rd);
1933 	else
1934 		error = xrep_dir_salvage_entries(rd);
1935 	if (error)
1936 		goto out_teardown;
1937 
1938 	/* Last chance to abort before we start committing fixes. */
1939 	if (xchk_should_terminate(sc, &error))
1940 		goto out_teardown;
1941 
1942 	error = xrep_dir_rebuild_tree(rd);
1943 	if (error)
1944 		goto out_teardown;
1945 
1946 	if (rd->needs_adoption) {
1947 		if (!xrep_orphanage_can_adopt(rd->sc))
1948 			error = -EFSCORRUPTED;
1949 		else
1950 			error = xrep_dir_move_to_orphanage(rd);
1951 		if (error)
1952 			goto out_teardown;
1953 	}
1954 
1955 out_teardown:
1956 	xrep_dir_teardown(sc);
1957 	return error;
1958 }
1959