xref: /linux/fs/xfs/scrub/dir_repair.c (revision 6f7e6393d1ce636bb7ec77a7fe7b77458fddf701)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (c) 2020-2024 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs_platform.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_bit.h"
14 #include "xfs_log_format.h"
15 #include "xfs_trans.h"
16 #include "xfs_sb.h"
17 #include "xfs_inode.h"
18 #include "xfs_icache.h"
19 #include "xfs_da_format.h"
20 #include "xfs_da_btree.h"
21 #include "xfs_dir2.h"
22 #include "xfs_dir2_priv.h"
23 #include "xfs_bmap.h"
24 #include "xfs_quota.h"
25 #include "xfs_bmap_btree.h"
26 #include "xfs_trans_space.h"
27 #include "xfs_bmap_util.h"
28 #include "xfs_exchmaps.h"
29 #include "xfs_exchrange.h"
30 #include "xfs_ag.h"
31 #include "xfs_parent.h"
32 #include "scrub/xfs_scrub.h"
33 #include "scrub/scrub.h"
34 #include "scrub/common.h"
35 #include "scrub/trace.h"
36 #include "scrub/repair.h"
37 #include "scrub/tempfile.h"
38 #include "scrub/tempexch.h"
39 #include "scrub/xfile.h"
40 #include "scrub/xfarray.h"
41 #include "scrub/xfblob.h"
42 #include "scrub/iscan.h"
43 #include "scrub/readdir.h"
44 #include "scrub/reap.h"
45 #include "scrub/findparent.h"
46 #include "scrub/orphanage.h"
47 #include "scrub/listxattr.h"
48 
49 /*
50  * Directory Repair
51  * ================
52  *
53  * We repair directories by reading the directory data blocks looking for
54  * directory entries that look salvageable (name passes verifiers, entry points
55  * to a valid allocated inode, etc).  Each entry worth salvaging is stashed in
56  * memory, and the stashed entries are periodically replayed into a temporary
57  * directory to constrain memory use.  Batching the construction of the
58  * temporary directory in this fashion reduces lock cycling of the directory
59  * being repaired and the temporary directory, and will later become important
60  * for parent pointer scanning.
61  *
62  * If parent pointers are enabled on this filesystem, we instead reconstruct
63  * the directory by visiting each parent pointer of each file in the filesystem
64  * and translating the relevant parent pointer records into dirents.  In this
65  * case, it is advantageous to stash all directory entries created from parent
66  * pointers for a single child file before replaying them into the temporary
67  * directory.  To save memory, the live filesystem scan reuses the findparent
68  * fields.  Directory repair chooses either parent pointer scanning or
69  * directory entry salvaging, but not both.
70  *
71  * Directory entries added to the temporary directory do not elevate the link
72  * counts of the inodes found.  When salvaging completes, the remaining stashed
73  * entries are replayed to the temporary directory.  An atomic mapping exchange
74  * is used to commit the new directory blocks to the directory being repaired.
75  * This will disrupt readdir cursors.
76  *
77  * Locking Issues
78  * --------------
79  *
80  * If /a, /a/b, and /c are all directories, the VFS does not take i_rwsem on
81  * /a/b for a "mv /a/b /c/" operation.  This means that only b's ILOCK protects
82  * b's dotdot update.  This is in contrast to every other dotdot update (link,
83  * remove, mkdir).  If the repair code drops the ILOCK, it must either
84  * revalidate the dotdot entry or use dirent hooks to capture updates from
85  * other threads.
86  */
87 
88 /* Create a dirent in the tempdir. */
89 #define XREP_DIRENT_ADD		(1)
90 
91 /* Remove a dirent from the tempdir. */
92 #define XREP_DIRENT_REMOVE	(2)
93 
94 /* Directory entry to be restored in the new directory. */
95 struct xrep_dirent {
96 	/* Cookie for retrieval of the dirent name. */
97 	xfblob_cookie		name_cookie;
98 
99 	/* Target inode number. */
100 	xfs_ino_t		ino;
101 
102 	/* Length of the dirent name. */
103 	uint8_t			namelen;
104 
105 	/* File type of the dirent. */
106 	uint8_t			ftype;
107 
108 	/* XREP_DIRENT_{ADD,REMOVE} */
109 	uint8_t			action;
110 };
111 
112 /*
113  * Stash up to 8 pages of recovered dirent data in dir_entries and dir_names
114  * before we write them to the temp dir.
115  */
116 #define XREP_DIR_MAX_STASH_BYTES	(PAGE_SIZE * 8)
117 
118 struct xrep_dir {
119 	struct xfs_scrub	*sc;
120 
121 	/* Fixed-size array of xrep_dirent structures. */
122 	struct xfarray		*dir_entries;
123 
124 	/* Blobs containing directory entry names. */
125 	struct xfblob		*dir_names;
126 
127 	/* Information for exchanging data forks at the end. */
128 	struct xrep_tempexch	tx;
129 
130 	/* Preallocated args struct for performing dir operations */
131 	struct xfs_da_args	args;
132 
133 	/*
134 	 * Information used to scan the filesystem to find the inumber of the
135 	 * dotdot entry for this directory.  For directory salvaging when
136 	 * parent pointers are not enabled, we use the findparent_* functions
137 	 * on this object and access only the parent_ino field directly.
138 	 *
139 	 * When parent pointers are enabled, however, the pptr scanner uses the
140 	 * iscan, hooks, lock, and parent_ino fields of this object directly.
141 	 * @pscan.lock coordinates access to dir_entries, dir_names,
142 	 * parent_ino, subdirs, dirents, and args.  This reduces the memory
143 	 * requirements of this structure.
144 	 */
145 	struct xrep_parent_scan_info pscan;
146 
147 	/*
148 	 * Context information for attaching this directory to the lost+found
149 	 * if this directory does not have a parent.
150 	 */
151 	struct xrep_adoption	adoption;
152 
153 	/* How many subdirectories did we find? */
154 	uint64_t		subdirs;
155 
156 	/* How many dirents did we find? */
157 	unsigned int		dirents;
158 
159 	/* Should we move this directory to the orphanage? */
160 	bool			needs_adoption;
161 
162 	/* Directory entry name, plus the trailing null. */
163 	struct xfs_name		xname;
164 	unsigned char		namebuf[MAXNAMELEN];
165 };
166 
167 /* Tear down all the incore stuff we created. */
168 static void
169 xrep_dir_teardown(
170 	struct xfs_scrub	*sc)
171 {
172 	struct xrep_dir		*rd = sc->buf;
173 
174 	xrep_findparent_scan_teardown(&rd->pscan);
175 	if (rd->dir_names)
176 		xfblob_destroy(rd->dir_names);
177 	rd->dir_names = NULL;
178 	if (rd->dir_entries)
179 		xfarray_destroy(rd->dir_entries);
180 	rd->dir_names = NULL;
181 }
182 
183 /* Set up for a directory repair. */
184 int
185 xrep_setup_directory(
186 	struct xfs_scrub	*sc)
187 {
188 	struct xrep_dir		*rd;
189 	int			error;
190 
191 	xchk_fsgates_enable(sc, XCHK_FSGATES_DIRENTS);
192 
193 	error = xrep_orphanage_try_create(sc);
194 	if (error)
195 		return error;
196 
197 	error = xrep_tempfile_create(sc, S_IFDIR);
198 	if (error)
199 		return error;
200 
201 	rd = kvzalloc(sizeof(struct xrep_dir), XCHK_GFP_FLAGS);
202 	if (!rd)
203 		return -ENOMEM;
204 	rd->sc = sc;
205 	rd->xname.name = rd->namebuf;
206 	sc->buf = rd;
207 
208 	return 0;
209 }
210 
211 /*
212  * Look up the dotdot entry and confirm that it's really the parent.
213  * Returns NULLFSINO if we don't know what to do.
214  */
215 static inline xfs_ino_t
216 xrep_dir_lookup_parent(
217 	struct xrep_dir		*rd)
218 {
219 	struct xfs_scrub	*sc = rd->sc;
220 	xfs_ino_t		ino;
221 	int			error;
222 
223 	error = xfs_dir_lookup(sc->tp, sc->ip, &xfs_name_dotdot, &ino, NULL);
224 	if (error)
225 		return NULLFSINO;
226 	if (!xfs_verify_dir_ino(sc->mp, ino))
227 		return NULLFSINO;
228 
229 	error = xrep_findparent_confirm(sc, &ino);
230 	if (error)
231 		return NULLFSINO;
232 
233 	return ino;
234 }
235 
236 /*
237  * Look up '..' in the dentry cache and confirm that it's really the parent.
238  * Returns NULLFSINO if the dcache misses or if the hit is implausible.
239  */
240 static inline xfs_ino_t
241 xrep_dir_dcache_parent(
242 	struct xrep_dir		*rd)
243 {
244 	struct xfs_scrub	*sc = rd->sc;
245 	xfs_ino_t		parent_ino;
246 	int			error;
247 
248 	parent_ino = xrep_findparent_from_dcache(sc);
249 	if (parent_ino == NULLFSINO)
250 		return parent_ino;
251 
252 	error = xrep_findparent_confirm(sc, &parent_ino);
253 	if (error)
254 		return NULLFSINO;
255 
256 	return parent_ino;
257 }
258 
259 /* Try to find the parent of the directory being repaired. */
260 STATIC int
261 xrep_dir_find_parent(
262 	struct xrep_dir		*rd)
263 {
264 	xfs_ino_t		ino;
265 
266 	ino = xrep_findparent_self_reference(rd->sc);
267 	if (ino != NULLFSINO) {
268 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
269 		return 0;
270 	}
271 
272 	ino = xrep_dir_dcache_parent(rd);
273 	if (ino != NULLFSINO) {
274 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
275 		return 0;
276 	}
277 
278 	ino = xrep_dir_lookup_parent(rd);
279 	if (ino != NULLFSINO) {
280 		xrep_findparent_scan_finish_early(&rd->pscan, ino);
281 		return 0;
282 	}
283 
284 	/*
285 	 * A full filesystem scan is the last resort.  On a busy filesystem,
286 	 * the scan can fail with -EBUSY if we cannot grab IOLOCKs.  That means
287 	 * that we don't know what who the parent is, so we should return to
288 	 * userspace.
289 	 */
290 	return xrep_findparent_scan(&rd->pscan);
291 }
292 
293 /*
294  * Decide if we want to salvage this entry.  We don't bother with oversized
295  * names or the dot entry.
296  */
297 STATIC int
298 xrep_dir_want_salvage(
299 	struct xrep_dir		*rd,
300 	const char		*name,
301 	int			namelen,
302 	xfs_ino_t		ino)
303 {
304 	struct xfs_mount	*mp = rd->sc->mp;
305 
306 	/* No pointers to ourselves or to garbage. */
307 	if (ino == rd->sc->ip->i_ino)
308 		return false;
309 	if (!xfs_verify_dir_ino(mp, ino))
310 		return false;
311 
312 	/* No weird looking names or dot entries. */
313 	if (namelen >= MAXNAMELEN || namelen <= 0)
314 		return false;
315 	if (namelen == 1 && name[0] == '.')
316 		return false;
317 	if (!xfs_dir2_namecheck(name, namelen))
318 		return false;
319 
320 	return true;
321 }
322 
323 /*
324  * Remember that we want to create a dirent in the tempdir.  These stashed
325  * actions will be replayed later.
326  */
327 STATIC int
328 xrep_dir_stash_createname(
329 	struct xrep_dir		*rd,
330 	const struct xfs_name	*name,
331 	xfs_ino_t		ino)
332 {
333 	struct xrep_dirent	dirent = {
334 		.action		= XREP_DIRENT_ADD,
335 		.ino		= ino,
336 		.namelen	= name->len,
337 		.ftype		= name->type,
338 	};
339 	int			error;
340 
341 	trace_xrep_dir_stash_createname(rd->sc->tempip, name, ino);
342 
343 	error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
344 	if (error)
345 		return error;
346 
347 	return xfarray_append(rd->dir_entries, &dirent);
348 }
349 
350 /*
351  * Remember that we want to remove a dirent from the tempdir.  These stashed
352  * actions will be replayed later.
353  */
354 STATIC int
355 xrep_dir_stash_removename(
356 	struct xrep_dir		*rd,
357 	const struct xfs_name	*name,
358 	xfs_ino_t		ino)
359 {
360 	struct xrep_dirent	dirent = {
361 		.action		= XREP_DIRENT_REMOVE,
362 		.ino		= ino,
363 		.namelen	= name->len,
364 		.ftype		= name->type,
365 	};
366 	int			error;
367 
368 	trace_xrep_dir_stash_removename(rd->sc->tempip, name, ino);
369 
370 	error = xfblob_storename(rd->dir_names, &dirent.name_cookie, name);
371 	if (error)
372 		return error;
373 
374 	return xfarray_append(rd->dir_entries, &dirent);
375 }
376 
377 /* Allocate an in-core record to hold entries while we rebuild the dir data. */
378 STATIC int
379 xrep_dir_salvage_entry(
380 	struct xrep_dir		*rd,
381 	unsigned char		*name,
382 	unsigned int		namelen,
383 	xfs_ino_t		ino)
384 {
385 	struct xfs_name		xname = {
386 		.name		= name,
387 	};
388 	struct xfs_scrub	*sc = rd->sc;
389 	struct xfs_inode	*ip;
390 	unsigned int		i = 0;
391 	int			error = 0;
392 
393 	if (xchk_should_terminate(sc, &error))
394 		return error;
395 
396 	/*
397 	 * Truncate the name to the first character that would trip namecheck.
398 	 * If we no longer have a name after that, ignore this entry.
399 	 */
400 	while (i < namelen && name[i] != 0 && name[i] != '/')
401 		i++;
402 	if (i == 0)
403 		return 0;
404 	xname.len = i;
405 
406 	/* Ignore '..' entries; we already picked the new parent. */
407 	if (xname.len == 2 && name[0] == '.' && name[1] == '.') {
408 		trace_xrep_dir_salvaged_parent(sc->ip, ino);
409 		return 0;
410 	}
411 
412 	trace_xrep_dir_salvage_entry(sc->ip, &xname, ino);
413 
414 	/*
415 	 * Compute the ftype or dump the entry if we can't.  We don't lock the
416 	 * inode because inodes can't change type while we have a reference.
417 	 */
418 	error = xchk_iget(sc, ino, &ip);
419 	if (error)
420 		return 0;
421 
422 	/* Don't mix metadata and regular directory trees. */
423 	if (xfs_is_metadir_inode(ip) != xfs_is_metadir_inode(rd->sc->ip)) {
424 		xchk_irele(sc, ip);
425 		return 0;
426 	}
427 
428 	xname.type = xfs_mode_to_ftype(VFS_I(ip)->i_mode);
429 	xchk_irele(sc, ip);
430 
431 	return xrep_dir_stash_createname(rd, &xname, ino);
432 }
433 
434 /* Record a shortform directory entry for later reinsertion. */
435 STATIC int
436 xrep_dir_salvage_sf_entry(
437 	struct xrep_dir			*rd,
438 	struct xfs_dir2_sf_hdr		*sfp,
439 	struct xfs_dir2_sf_entry	*sfep)
440 {
441 	xfs_ino_t			ino;
442 
443 	ino = xfs_dir2_sf_get_ino(rd->sc->mp, sfp, sfep);
444 	if (!xrep_dir_want_salvage(rd, sfep->name, sfep->namelen, ino))
445 		return 0;
446 
447 	return xrep_dir_salvage_entry(rd, sfep->name, sfep->namelen, ino);
448 }
449 
450 /* Record a regular directory entry for later reinsertion. */
451 STATIC int
452 xrep_dir_salvage_data_entry(
453 	struct xrep_dir			*rd,
454 	struct xfs_dir2_data_entry	*dep)
455 {
456 	xfs_ino_t			ino;
457 
458 	ino = be64_to_cpu(dep->inumber);
459 	if (!xrep_dir_want_salvage(rd, dep->name, dep->namelen, ino))
460 		return 0;
461 
462 	return xrep_dir_salvage_entry(rd, dep->name, dep->namelen, ino);
463 }
464 
465 /* Try to recover block/data format directory entries. */
466 STATIC int
467 xrep_dir_recover_data(
468 	struct xrep_dir		*rd,
469 	struct xfs_buf		*bp)
470 {
471 	struct xfs_da_geometry	*geo = rd->sc->mp->m_dir_geo;
472 	unsigned int		offset;
473 	unsigned int		end;
474 	int			error = 0;
475 
476 	/*
477 	 * Loop over the data portion of the block.
478 	 * Each object is a real entry (dep) or an unused one (dup).
479 	 */
480 	offset = geo->data_entry_offset;
481 	end = min_t(unsigned int, BBTOB(bp->b_length),
482 			xfs_dir3_data_end_offset(geo, bp->b_addr));
483 
484 	while (offset < end) {
485 		struct xfs_dir2_data_unused	*dup = bp->b_addr + offset;
486 		struct xfs_dir2_data_entry	*dep = bp->b_addr + offset;
487 
488 		if (xchk_should_terminate(rd->sc, &error))
489 			return error;
490 
491 		/* Skip unused entries. */
492 		if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
493 			offset += be16_to_cpu(dup->length);
494 			continue;
495 		}
496 
497 		/* Don't walk off the end of the block. */
498 		offset += xfs_dir2_data_entsize(rd->sc->mp, dep->namelen);
499 		if (offset > end)
500 			break;
501 
502 		/* Ok, let's save this entry. */
503 		error = xrep_dir_salvage_data_entry(rd, dep);
504 		if (error)
505 			return error;
506 
507 	}
508 
509 	return 0;
510 }
511 
512 /* Try to recover shortform directory entries. */
513 STATIC int
514 xrep_dir_recover_sf(
515 	struct xrep_dir			*rd)
516 {
517 	struct xfs_dir2_sf_hdr		*hdr;
518 	struct xfs_dir2_sf_entry	*sfep;
519 	struct xfs_dir2_sf_entry	*next;
520 	struct xfs_ifork		*ifp;
521 	xfs_ino_t			ino;
522 	unsigned char			*end;
523 	int				error = 0;
524 
525 	ifp = xfs_ifork_ptr(rd->sc->ip, XFS_DATA_FORK);
526 	hdr = ifp->if_data;
527 	end = (unsigned char *)ifp->if_data + ifp->if_bytes;
528 
529 	ino = xfs_dir2_sf_get_parent_ino(hdr);
530 	trace_xrep_dir_salvaged_parent(rd->sc->ip, ino);
531 
532 	sfep = xfs_dir2_sf_firstentry(hdr);
533 	while ((unsigned char *)sfep < end) {
534 		if (xchk_should_terminate(rd->sc, &error))
535 			return error;
536 
537 		next = xfs_dir2_sf_nextentry(rd->sc->mp, hdr, sfep);
538 		if ((unsigned char *)next > end)
539 			break;
540 
541 		/* Ok, let's save this entry. */
542 		error = xrep_dir_salvage_sf_entry(rd, hdr, sfep);
543 		if (error)
544 			return error;
545 
546 		sfep = next;
547 	}
548 
549 	return 0;
550 }
551 
552 /*
553  * Try to figure out the format of this directory from the data fork mappings
554  * and the directory size.  If we can be reasonably sure of format, we can be
555  * more aggressive in salvaging directory entries.  On return, @magic_guess
556  * will be set to DIR3_BLOCK_MAGIC if we think this is a "block format"
557  * directory; DIR3_DATA_MAGIC if we think this is a "data format" directory,
558  * and 0 if we can't tell.
559  */
560 STATIC void
561 xrep_dir_guess_format(
562 	struct xrep_dir		*rd,
563 	__be32			*magic_guess)
564 {
565 	struct xfs_inode	*dp = rd->sc->ip;
566 	struct xfs_mount	*mp = rd->sc->mp;
567 	struct xfs_da_geometry	*geo = mp->m_dir_geo;
568 	xfs_fileoff_t		last;
569 	int			error;
570 
571 	ASSERT(xfs_has_crc(mp));
572 
573 	*magic_guess = 0;
574 
575 	/*
576 	 * If there's a single directory block and the directory size is
577 	 * exactly one block, this has to be a single block format directory.
578 	 */
579 	error = xfs_bmap_last_offset(dp, &last, XFS_DATA_FORK);
580 	if (!error && XFS_FSB_TO_B(mp, last) == geo->blksize &&
581 	    dp->i_disk_size == geo->blksize) {
582 		*magic_guess = cpu_to_be32(XFS_DIR3_BLOCK_MAGIC);
583 		return;
584 	}
585 
586 	/*
587 	 * If the last extent before the leaf offset matches the directory
588 	 * size and the directory size is larger than 1 block, this is a
589 	 * data format directory.
590 	 */
591 	last = geo->leafblk;
592 	error = xfs_bmap_last_before(rd->sc->tp, dp, &last, XFS_DATA_FORK);
593 	if (!error &&
594 	    XFS_FSB_TO_B(mp, last) > geo->blksize &&
595 	    XFS_FSB_TO_B(mp, last) == dp->i_disk_size) {
596 		*magic_guess = cpu_to_be32(XFS_DIR3_DATA_MAGIC);
597 		return;
598 	}
599 }
600 
601 /* Recover directory entries from a specific directory block. */
602 STATIC int
603 xrep_dir_recover_dirblock(
604 	struct xrep_dir		*rd,
605 	__be32			magic_guess,
606 	xfs_dablk_t		dabno)
607 {
608 	struct xfs_dir2_data_hdr *hdr;
609 	struct xfs_buf		*bp;
610 	__be32			oldmagic;
611 	int			error;
612 
613 	/*
614 	 * Try to read buffer.  We invalidate them in the next step so we don't
615 	 * bother to set a buffer type or ops.
616 	 */
617 	error = xfs_da_read_buf(rd->sc->tp, rd->sc->ip, dabno,
618 			XFS_DABUF_MAP_HOLE_OK, &bp, XFS_DATA_FORK, NULL);
619 	if (error || !bp)
620 		return error;
621 
622 	hdr = bp->b_addr;
623 	oldmagic = hdr->magic;
624 
625 	trace_xrep_dir_recover_dirblock(rd->sc->ip, dabno,
626 			be32_to_cpu(hdr->magic), be32_to_cpu(magic_guess));
627 
628 	/*
629 	 * If we're sure of the block's format, proceed with the salvage
630 	 * operation using the specified magic number.
631 	 */
632 	if (magic_guess) {
633 		hdr->magic = magic_guess;
634 		goto recover;
635 	}
636 
637 	/*
638 	 * If we couldn't guess what type of directory this is, then we will
639 	 * only salvage entries from directory blocks that match the magic
640 	 * number and pass verifiers.
641 	 */
642 	switch (hdr->magic) {
643 	case cpu_to_be32(XFS_DIR2_BLOCK_MAGIC):
644 	case cpu_to_be32(XFS_DIR3_BLOCK_MAGIC):
645 		if (!xrep_buf_verify_struct(bp, &xfs_dir3_block_buf_ops))
646 			goto out;
647 		if (xfs_dir3_block_header_check(bp, rd->sc->ip->i_ino) != NULL)
648 			goto out;
649 		break;
650 	case cpu_to_be32(XFS_DIR2_DATA_MAGIC):
651 	case cpu_to_be32(XFS_DIR3_DATA_MAGIC):
652 		if (!xrep_buf_verify_struct(bp, &xfs_dir3_data_buf_ops))
653 			goto out;
654 		if (xfs_dir3_data_header_check(bp, rd->sc->ip->i_ino) != NULL)
655 			goto out;
656 		break;
657 	default:
658 		goto out;
659 	}
660 
661 recover:
662 	error = xrep_dir_recover_data(rd, bp);
663 
664 out:
665 	hdr->magic = oldmagic;
666 	xfs_trans_brelse(rd->sc->tp, bp);
667 	return error;
668 }
669 
670 static inline void
671 xrep_dir_init_args(
672 	struct xrep_dir		*rd,
673 	struct xfs_inode	*dp,
674 	const struct xfs_name	*name)
675 {
676 	memset(&rd->args, 0, sizeof(struct xfs_da_args));
677 	rd->args.geo = rd->sc->mp->m_dir_geo;
678 	rd->args.whichfork = XFS_DATA_FORK;
679 	rd->args.owner = rd->sc->ip->i_ino;
680 	rd->args.trans = rd->sc->tp;
681 	rd->args.dp = dp;
682 	if (!name)
683 		return;
684 	rd->args.name = name->name;
685 	rd->args.namelen = name->len;
686 	rd->args.filetype = name->type;
687 	rd->args.hashval = xfs_dir2_hashname(rd->sc->mp, name);
688 }
689 
690 /* Replay a stashed createname into the temporary directory. */
691 STATIC int
692 xrep_dir_replay_createname(
693 	struct xrep_dir		*rd,
694 	const struct xfs_name	*name,
695 	xfs_ino_t		inum,
696 	xfs_extlen_t		total)
697 {
698 	struct xfs_scrub	*sc = rd->sc;
699 	struct xfs_inode	*dp = rd->sc->tempip;
700 	int			error;
701 
702 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
703 
704 	error = xfs_dir_ino_validate(sc->mp, inum);
705 	if (error)
706 		return error;
707 
708 	trace_xrep_dir_replay_createname(dp, name, inum);
709 
710 	xrep_dir_init_args(rd, dp, name);
711 	rd->args.inumber = inum;
712 	rd->args.total = total;
713 	rd->args.op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT;
714 	return xfs_dir_createname_args(&rd->args);
715 }
716 
717 /* Replay a stashed removename onto the temporary directory. */
718 STATIC int
719 xrep_dir_replay_removename(
720 	struct xrep_dir		*rd,
721 	const struct xfs_name	*name,
722 	xfs_extlen_t		total)
723 {
724 	struct xfs_inode	*dp = rd->args.dp;
725 
726 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
727 
728 	xrep_dir_init_args(rd, dp, name);
729 	rd->args.op_flags = 0;
730 	rd->args.total = total;
731 
732 	trace_xrep_dir_replay_removename(dp, name, 0);
733 	return xfs_dir_removename_args(&rd->args);
734 }
735 
736 /*
737  * Add this stashed incore directory entry to the temporary directory.
738  * The caller must hold the tempdir's IOLOCK, must not hold any ILOCKs, and
739  * must not be in transaction context.
740  */
741 STATIC int
742 xrep_dir_replay_update(
743 	struct xrep_dir			*rd,
744 	const struct xfs_name		*xname,
745 	const struct xrep_dirent	*dirent)
746 {
747 	struct xfs_mount		*mp = rd->sc->mp;
748 #ifdef DEBUG
749 	xfs_ino_t			ino;
750 #endif
751 	uint				resblks;
752 	int				error;
753 
754 	resblks = xfs_link_space_res(mp, xname->len);
755 	error = xchk_trans_alloc(rd->sc, resblks);
756 	if (error)
757 		return error;
758 
759 	/* Lock the temporary directory and join it to the transaction */
760 	xrep_tempfile_ilock(rd->sc);
761 	xfs_trans_ijoin(rd->sc->tp, rd->sc->tempip, 0);
762 
763 	switch (dirent->action) {
764 	case XREP_DIRENT_ADD:
765 		/*
766 		 * Create a replacement dirent in the temporary directory.
767 		 * Note that _createname doesn't check for existing entries.
768 		 * There shouldn't be any in the temporary dir, but we'll
769 		 * verify this in debug mode.
770 		 */
771 #ifdef DEBUG
772 		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
773 		if (error != -ENOENT) {
774 			ASSERT(error != -ENOENT);
775 			goto out_cancel;
776 		}
777 #endif
778 
779 		error = xrep_dir_replay_createname(rd, xname, dirent->ino,
780 				resblks);
781 		if (error)
782 			goto out_cancel;
783 
784 		if (xname->type == XFS_DIR3_FT_DIR)
785 			rd->subdirs++;
786 		rd->dirents++;
787 		break;
788 	case XREP_DIRENT_REMOVE:
789 		/*
790 		 * Remove a dirent from the temporary directory.  Note that
791 		 * _removename doesn't check the inode target of the exist
792 		 * entry.  There should be a perfect match in the temporary
793 		 * dir, but we'll verify this in debug mode.
794 		 */
795 #ifdef DEBUG
796 		error = xchk_dir_lookup(rd->sc, rd->sc->tempip, xname, &ino);
797 		if (error) {
798 			ASSERT(error != 0);
799 			goto out_cancel;
800 		}
801 		if (ino != dirent->ino) {
802 			ASSERT(ino == dirent->ino);
803 			error = -EIO;
804 			goto out_cancel;
805 		}
806 #endif
807 
808 		error = xrep_dir_replay_removename(rd, xname, resblks);
809 		if (error)
810 			goto out_cancel;
811 
812 		if (xname->type == XFS_DIR3_FT_DIR)
813 			rd->subdirs--;
814 		rd->dirents--;
815 		break;
816 	default:
817 		ASSERT(0);
818 		error = -EIO;
819 		goto out_cancel;
820 	}
821 
822 	/* Commit and unlock. */
823 	error = xrep_trans_commit(rd->sc);
824 	if (error)
825 		return error;
826 
827 	xrep_tempfile_iunlock(rd->sc);
828 	return 0;
829 out_cancel:
830 	xchk_trans_cancel(rd->sc);
831 	xrep_tempfile_iunlock(rd->sc);
832 	return error;
833 }
834 
835 /*
836  * Flush stashed incore dirent updates that have been recorded by the scanner.
837  * This is done to reduce the memory requirements of the directory rebuild,
838  * since directories can contain up to 32GB of directory data.
839  *
840  * Caller must not hold transactions or ILOCKs.  Caller must hold the tempdir
841  * IOLOCK.
842  */
843 STATIC int
844 xrep_dir_replay_updates(
845 	struct xrep_dir		*rd)
846 {
847 	xfarray_idx_t		array_cur;
848 	int			error;
849 
850 	/* Add all the salvaged dirents to the temporary directory. */
851 	mutex_lock(&rd->pscan.lock);
852 	foreach_xfarray_idx(rd->dir_entries, array_cur) {
853 		struct xrep_dirent	dirent;
854 
855 		error = xfarray_load(rd->dir_entries, array_cur, &dirent);
856 		if (error)
857 			goto out_unlock;
858 
859 		error = xfblob_loadname(rd->dir_names, dirent.name_cookie,
860 				&rd->xname, dirent.namelen);
861 		if (error)
862 			goto out_unlock;
863 		rd->xname.type = dirent.ftype;
864 		mutex_unlock(&rd->pscan.lock);
865 
866 		error = xrep_dir_replay_update(rd, &rd->xname, &dirent);
867 		if (error)
868 			return error;
869 		mutex_lock(&rd->pscan.lock);
870 	}
871 
872 	/* Empty out both arrays now that we've added the entries. */
873 	xfarray_truncate(rd->dir_entries);
874 	xfblob_truncate(rd->dir_names);
875 	mutex_unlock(&rd->pscan.lock);
876 	return 0;
877 out_unlock:
878 	mutex_unlock(&rd->pscan.lock);
879 	return error;
880 }
881 
882 /*
883  * Periodically flush stashed directory entries to the temporary dir.  This
884  * is done to reduce the memory requirements of the directory rebuild, since
885  * directories can contain up to 32GB of directory data.
886  */
887 STATIC int
888 xrep_dir_flush_stashed(
889 	struct xrep_dir		*rd)
890 {
891 	int			error;
892 
893 	/*
894 	 * Entering this function, the scrub context has a reference to the
895 	 * inode being repaired, the temporary file, and a scrub transaction
896 	 * that we use during dirent salvaging to avoid livelocking if there
897 	 * are cycles in the directory structures.  We hold ILOCK_EXCL on both
898 	 * the inode being repaired and the temporary file, though they are
899 	 * not ijoined to the scrub transaction.
900 	 *
901 	 * To constrain kernel memory use, we occasionally write salvaged
902 	 * dirents from the xfarray and xfblob structures into the temporary
903 	 * directory in preparation for exchanging the directory structures at
904 	 * the end.  Updating the temporary file requires a transaction, so we
905 	 * commit the scrub transaction and drop the two ILOCKs so that
906 	 * we can allocate whatever transaction we want.
907 	 *
908 	 * We still hold IOLOCK_EXCL on the inode being repaired, which
909 	 * prevents anyone from accessing the damaged directory data while we
910 	 * repair it.
911 	 */
912 	error = xrep_trans_commit(rd->sc);
913 	if (error)
914 		return error;
915 	xchk_iunlock(rd->sc, XFS_ILOCK_EXCL);
916 
917 	/*
918 	 * Take the IOLOCK of the temporary file while we modify dirents.  This
919 	 * isn't strictly required because the temporary file is never revealed
920 	 * to userspace, but we follow the same locking rules.  We still hold
921 	 * sc->ip's IOLOCK.
922 	 */
923 	error = xrep_tempfile_iolock_polled(rd->sc);
924 	if (error)
925 		return error;
926 
927 	/* Write to the tempdir all the updates that we've stashed. */
928 	error = xrep_dir_replay_updates(rd);
929 	xrep_tempfile_iounlock(rd->sc);
930 	if (error)
931 		return error;
932 
933 	/*
934 	 * Recreate the salvage transaction and relock the dir we're salvaging.
935 	 */
936 	error = xchk_trans_alloc(rd->sc, 0);
937 	if (error)
938 		return error;
939 	xchk_ilock(rd->sc, XFS_ILOCK_EXCL);
940 	return 0;
941 }
942 
943 /* Decide if we've stashed too much dirent data in memory. */
944 static inline bool
945 xrep_dir_want_flush_stashed(
946 	struct xrep_dir		*rd)
947 {
948 	unsigned long long	bytes;
949 
950 	bytes = xfarray_bytes(rd->dir_entries) + xfblob_bytes(rd->dir_names);
951 	return bytes > XREP_DIR_MAX_STASH_BYTES;
952 }
953 
954 /* Extract as many directory entries as we can. */
955 STATIC int
956 xrep_dir_recover(
957 	struct xrep_dir		*rd)
958 {
959 	struct xfs_bmbt_irec	got;
960 	struct xfs_scrub	*sc = rd->sc;
961 	struct xfs_da_geometry	*geo = sc->mp->m_dir_geo;
962 	xfs_fileoff_t		offset;
963 	xfs_dablk_t		dabno;
964 	__be32			magic_guess;
965 	int			nmap;
966 	int			error;
967 
968 	xrep_dir_guess_format(rd, &magic_guess);
969 
970 	/* Iterate each directory data block in the data fork. */
971 	for (offset = 0;
972 	     offset < geo->leafblk;
973 	     offset = got.br_startoff + got.br_blockcount) {
974 		nmap = 1;
975 		error = xfs_bmapi_read(sc->ip, offset, geo->leafblk - offset,
976 				&got, &nmap, 0);
977 		if (error)
978 			return error;
979 		if (nmap != 1)
980 			return -EFSCORRUPTED;
981 		if (!xfs_bmap_is_written_extent(&got))
982 			continue;
983 
984 		for (dabno = round_up(got.br_startoff, geo->fsbcount);
985 		     dabno < got.br_startoff + got.br_blockcount;
986 		     dabno += geo->fsbcount) {
987 			if (xchk_should_terminate(rd->sc, &error))
988 				return error;
989 
990 			error = xrep_dir_recover_dirblock(rd,
991 					magic_guess, dabno);
992 			if (error)
993 				return error;
994 
995 			/* Flush dirents to constrain memory usage. */
996 			if (xrep_dir_want_flush_stashed(rd)) {
997 				error = xrep_dir_flush_stashed(rd);
998 				if (error)
999 					return error;
1000 			}
1001 		}
1002 	}
1003 
1004 	return 0;
1005 }
1006 
1007 /*
1008  * Find all the directory entries for this inode by scraping them out of the
1009  * directory leaf blocks by hand, and flushing them into the temp dir.
1010  */
1011 STATIC int
1012 xrep_dir_find_entries(
1013 	struct xrep_dir		*rd)
1014 {
1015 	struct xfs_inode	*dp = rd->sc->ip;
1016 	int			error;
1017 
1018 	/*
1019 	 * Salvage directory entries from the old directory, and write them to
1020 	 * the temporary directory.
1021 	 */
1022 	if (dp->i_df.if_format == XFS_DINODE_FMT_LOCAL) {
1023 		error = xrep_dir_recover_sf(rd);
1024 	} else {
1025 		error = xfs_iread_extents(rd->sc->tp, dp, XFS_DATA_FORK);
1026 		if (error)
1027 			return error;
1028 
1029 		error = xrep_dir_recover(rd);
1030 	}
1031 	if (error)
1032 		return error;
1033 
1034 	return xrep_dir_flush_stashed(rd);
1035 }
1036 
1037 /* Scan all files in the filesystem for dirents. */
1038 STATIC int
1039 xrep_dir_salvage_entries(
1040 	struct xrep_dir		*rd)
1041 {
1042 	struct xfs_scrub	*sc = rd->sc;
1043 	int			error;
1044 
1045 	/*
1046 	 * Drop the ILOCK on this directory so that we can scan for this
1047 	 * directory's parent.  Figure out who is going to be the parent of
1048 	 * this directory, then retake the ILOCK so that we can salvage
1049 	 * directory entries.
1050 	 */
1051 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1052 	error = xrep_dir_find_parent(rd);
1053 	xchk_ilock(sc, XFS_ILOCK_EXCL);
1054 	if (error)
1055 		return error;
1056 
1057 	/*
1058 	 * Collect directory entries by parsing raw leaf blocks to salvage
1059 	 * whatever we can.  When we're done, free the staging memory before
1060 	 * exchanging the directories to reduce memory usage.
1061 	 */
1062 	error = xrep_dir_find_entries(rd);
1063 	if (error)
1064 		return error;
1065 
1066 	/*
1067 	 * Cancel the repair transaction and drop the ILOCK so that we can
1068 	 * (later) use the atomic mapping exchange functions to compute the
1069 	 * correct block reservations and re-lock the inodes.
1070 	 *
1071 	 * We still hold IOLOCK_EXCL (aka i_rwsem) which will prevent directory
1072 	 * modifications, but there's nothing to prevent userspace from reading
1073 	 * the directory until we're ready for the exchange operation.  Reads
1074 	 * will return -EIO without shutting down the fs, so we're ok with
1075 	 * that.
1076 	 *
1077 	 * The VFS can change dotdot on us, but the findparent scan will keep
1078 	 * our incore parent inode up to date.  See the note on locking issues
1079 	 * for more details.
1080 	 */
1081 	error = xrep_trans_commit(sc);
1082 	if (error)
1083 		return error;
1084 
1085 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1086 	return 0;
1087 }
1088 
1089 
1090 /*
1091  * Examine a parent pointer of a file.  If it leads us back to the directory
1092  * that we're rebuilding, create an incore dirent from the parent pointer and
1093  * stash it.
1094  */
1095 STATIC int
1096 xrep_dir_scan_pptr(
1097 	struct xfs_scrub		*sc,
1098 	struct xfs_inode		*ip,
1099 	unsigned int			attr_flags,
1100 	const unsigned char		*name,
1101 	unsigned int			namelen,
1102 	const void			*value,
1103 	unsigned int			valuelen,
1104 	void				*priv)
1105 {
1106 	struct xfs_name			xname = {
1107 		.name			= name,
1108 		.len			= namelen,
1109 		.type			= xfs_mode_to_ftype(VFS_I(ip)->i_mode),
1110 	};
1111 	xfs_ino_t			parent_ino;
1112 	uint32_t			parent_gen;
1113 	struct xrep_dir			*rd = priv;
1114 	int				error;
1115 
1116 	if (!(attr_flags & XFS_ATTR_PARENT))
1117 		return 0;
1118 
1119 	/*
1120 	 * Ignore parent pointers that point back to a different dir, list the
1121 	 * wrong generation number, or are invalid.
1122 	 */
1123 	error = xfs_parent_from_attr(sc->mp, attr_flags, name, namelen, value,
1124 			valuelen, &parent_ino, &parent_gen);
1125 	if (error)
1126 		return error;
1127 
1128 	if (parent_ino != sc->ip->i_ino ||
1129 	    parent_gen != VFS_I(sc->ip)->i_generation)
1130 		return 0;
1131 
1132 	mutex_lock(&rd->pscan.lock);
1133 	error = xrep_dir_stash_createname(rd, &xname, ip->i_ino);
1134 	mutex_unlock(&rd->pscan.lock);
1135 	return error;
1136 }
1137 
1138 /*
1139  * If this child dirent points to the directory being repaired, remember that
1140  * fact so that we can reset the dotdot entry if necessary.
1141  */
1142 STATIC int
1143 xrep_dir_scan_dirent(
1144 	struct xfs_scrub	*sc,
1145 	struct xfs_inode	*dp,
1146 	xfs_dir2_dataptr_t	dapos,
1147 	const struct xfs_name	*name,
1148 	xfs_ino_t		ino,
1149 	void			*priv)
1150 {
1151 	struct xrep_dir		*rd = priv;
1152 
1153 	/* Dirent doesn't point to this directory. */
1154 	if (ino != rd->sc->ip->i_ino)
1155 		return 0;
1156 
1157 	/* Ignore garbage inum. */
1158 	if (!xfs_verify_dir_ino(rd->sc->mp, ino))
1159 		return 0;
1160 
1161 	/* No weird looking names. */
1162 	if (name->len >= MAXNAMELEN || name->len <= 0)
1163 		return 0;
1164 
1165 	/* Don't pick up dot or dotdot entries; we only want child dirents. */
1166 	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
1167 	    xfs_dir2_samename(name, &xfs_name_dot))
1168 		return 0;
1169 
1170 	trace_xrep_dir_stash_createname(sc->tempip, &xfs_name_dotdot,
1171 			dp->i_ino);
1172 
1173 	xrep_findparent_scan_found(&rd->pscan, dp->i_ino);
1174 	return 0;
1175 }
1176 
1177 /*
1178  * Decide if we want to look for child dirents or parent pointers in this file.
1179  * Skip the dir being repaired and any files being used to stage repairs.
1180  */
1181 static inline bool
1182 xrep_dir_want_scan(
1183 	struct xrep_dir		*rd,
1184 	const struct xfs_inode	*ip)
1185 {
1186 	return ip != rd->sc->ip && !xrep_is_tempfile(ip);
1187 }
1188 
1189 /*
1190  * Take ILOCK on a file that we want to scan.
1191  *
1192  * Select ILOCK_EXCL if the file is a directory with an unloaded data bmbt or
1193  * has an unloaded attr bmbt.  Otherwise, take ILOCK_SHARED.
1194  */
1195 static inline unsigned int
1196 xrep_dir_scan_ilock(
1197 	struct xrep_dir		*rd,
1198 	struct xfs_inode	*ip)
1199 {
1200 	uint			lock_mode = XFS_ILOCK_SHARED;
1201 
1202 	/* Need to take the shared ILOCK to advance the iscan cursor. */
1203 	if (!xrep_dir_want_scan(rd, ip))
1204 		goto lock;
1205 
1206 	if (S_ISDIR(VFS_I(ip)->i_mode) && xfs_need_iread_extents(&ip->i_df)) {
1207 		lock_mode = XFS_ILOCK_EXCL;
1208 		goto lock;
1209 	}
1210 
1211 	if (xfs_inode_has_attr_fork(ip) && xfs_need_iread_extents(&ip->i_af))
1212 		lock_mode = XFS_ILOCK_EXCL;
1213 
1214 lock:
1215 	xfs_ilock(ip, lock_mode);
1216 	return lock_mode;
1217 }
1218 
1219 /*
1220  * Scan this file for relevant child dirents or parent pointers that point to
1221  * the directory we're rebuilding.
1222  */
1223 STATIC int
1224 xrep_dir_scan_file(
1225 	struct xrep_dir		*rd,
1226 	struct xfs_inode	*ip)
1227 {
1228 	unsigned int		lock_mode;
1229 	int			error = 0;
1230 
1231 	lock_mode = xrep_dir_scan_ilock(rd, ip);
1232 
1233 	if (!xrep_dir_want_scan(rd, ip))
1234 		goto scan_done;
1235 
1236 	/*
1237 	 * If the extended attributes look as though they has been zapped by
1238 	 * the inode record repair code, we cannot scan for parent pointers.
1239 	 */
1240 	if (xchk_pptr_looks_zapped(ip)) {
1241 		error = -EBUSY;
1242 		goto scan_done;
1243 	}
1244 
1245 	error = xchk_xattr_walk(rd->sc, ip, xrep_dir_scan_pptr, NULL, rd);
1246 	if (error)
1247 		goto scan_done;
1248 
1249 	if (S_ISDIR(VFS_I(ip)->i_mode)) {
1250 		/*
1251 		 * If the directory looks as though it has been zapped by the
1252 		 * inode record repair code, we cannot scan for child dirents.
1253 		 */
1254 		if (xchk_dir_looks_zapped(ip)) {
1255 			error = -EBUSY;
1256 			goto scan_done;
1257 		}
1258 
1259 		error = xchk_dir_walk(rd->sc, ip, xrep_dir_scan_dirent, rd);
1260 		if (error)
1261 			goto scan_done;
1262 	}
1263 
1264 scan_done:
1265 	xchk_iscan_mark_visited(&rd->pscan.iscan, ip);
1266 	xfs_iunlock(ip, lock_mode);
1267 	return error;
1268 }
1269 
1270 /*
1271  * Scan all files in the filesystem for parent pointers that we can turn into
1272  * replacement dirents, and a dirent that we can use to set the dotdot pointer.
1273  */
1274 STATIC int
1275 xrep_dir_scan_dirtree(
1276 	struct xrep_dir		*rd)
1277 {
1278 	struct xfs_scrub	*sc = rd->sc;
1279 	struct xfs_inode	*ip;
1280 	int			error;
1281 
1282 	/* Roots of directory trees are their own parents. */
1283 	if (xchk_inode_is_dirtree_root(sc->ip))
1284 		xrep_findparent_scan_found(&rd->pscan, sc->ip->i_ino);
1285 
1286 	/*
1287 	 * Filesystem scans are time consuming.  Drop the directory ILOCK and
1288 	 * all other resources for the duration of the scan and hope for the
1289 	 * best.  The live update hooks will keep our scan information up to
1290 	 * date even though we've dropped the locks.
1291 	 */
1292 	xchk_trans_cancel(sc);
1293 	if (sc->ilock_flags & (XFS_ILOCK_SHARED | XFS_ILOCK_EXCL))
1294 		xchk_iunlock(sc, sc->ilock_flags & (XFS_ILOCK_SHARED |
1295 						    XFS_ILOCK_EXCL));
1296 	xchk_trans_alloc_empty(sc);
1297 
1298 	while ((error = xchk_iscan_iter(&rd->pscan.iscan, &ip)) == 1) {
1299 		bool		flush;
1300 
1301 		error = xrep_dir_scan_file(rd, ip);
1302 		xchk_irele(sc, ip);
1303 		if (error)
1304 			break;
1305 
1306 		/* Flush stashed dirent updates to constrain memory usage. */
1307 		mutex_lock(&rd->pscan.lock);
1308 		flush = xrep_dir_want_flush_stashed(rd);
1309 		mutex_unlock(&rd->pscan.lock);
1310 		if (flush) {
1311 			xchk_trans_cancel(sc);
1312 
1313 			error = xrep_tempfile_iolock_polled(sc);
1314 			if (error)
1315 				break;
1316 
1317 			error = xrep_dir_replay_updates(rd);
1318 			xrep_tempfile_iounlock(sc);
1319 			if (error)
1320 				break;
1321 
1322 			xchk_trans_alloc_empty(sc);
1323 		}
1324 
1325 		if (xchk_should_terminate(sc, &error))
1326 			break;
1327 	}
1328 	xchk_iscan_iter_finish(&rd->pscan.iscan);
1329 	if (error) {
1330 		/*
1331 		 * If we couldn't grab an inode that was busy with a state
1332 		 * change, change the error code so that we exit to userspace
1333 		 * as quickly as possible.
1334 		 */
1335 		if (error == -EBUSY)
1336 			return -ECANCELED;
1337 		return error;
1338 	}
1339 
1340 	/*
1341 	 * Cancel the empty transaction so that we can (later) use the atomic
1342 	 * file mapping exchange functions to lock files and commit the new
1343 	 * directory.
1344 	 */
1345 	xchk_trans_cancel(rd->sc);
1346 	return 0;
1347 }
1348 
1349 /*
1350  * Capture dirent updates being made by other threads which are relevant to the
1351  * directory being repaired.
1352  */
1353 STATIC int
1354 xrep_dir_live_update(
1355 	struct notifier_block		*nb,
1356 	unsigned long			action,
1357 	void				*data)
1358 {
1359 	struct xfs_dir_update_params	*p = data;
1360 	struct xrep_dir			*rd;
1361 	struct xfs_scrub		*sc;
1362 	int				error = 0;
1363 
1364 	rd = container_of(nb, struct xrep_dir, pscan.dhook.dirent_hook.nb);
1365 	sc = rd->sc;
1366 
1367 	/*
1368 	 * This thread updated a child dirent in the directory that we're
1369 	 * rebuilding.  Stash the update for replay against the temporary
1370 	 * directory.
1371 	 */
1372 	if (p->dp->i_ino == sc->ip->i_ino &&
1373 	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->ip->i_ino)) {
1374 		mutex_lock(&rd->pscan.lock);
1375 		if (p->delta > 0)
1376 			error = xrep_dir_stash_createname(rd, p->name,
1377 					p->ip->i_ino);
1378 		else
1379 			error = xrep_dir_stash_removename(rd, p->name,
1380 					p->ip->i_ino);
1381 		mutex_unlock(&rd->pscan.lock);
1382 		if (error)
1383 			goto out_abort;
1384 	}
1385 
1386 	/*
1387 	 * This thread updated another directory's child dirent that points to
1388 	 * the directory that we're rebuilding, so remember the new dotdot
1389 	 * target.
1390 	 */
1391 	if (p->ip->i_ino == sc->ip->i_ino &&
1392 	    xchk_iscan_want_live_update(&rd->pscan.iscan, p->dp->i_ino)) {
1393 		if (p->delta > 0) {
1394 			trace_xrep_dir_stash_createname(sc->tempip,
1395 					&xfs_name_dotdot,
1396 					p->dp->i_ino);
1397 
1398 			xrep_findparent_scan_found(&rd->pscan, p->dp->i_ino);
1399 		} else {
1400 			trace_xrep_dir_stash_removename(sc->tempip,
1401 					&xfs_name_dotdot,
1402 					rd->pscan.parent_ino);
1403 
1404 			xrep_findparent_scan_found(&rd->pscan, NULLFSINO);
1405 		}
1406 	}
1407 
1408 	return NOTIFY_DONE;
1409 out_abort:
1410 	xchk_iscan_abort(&rd->pscan.iscan);
1411 	return NOTIFY_DONE;
1412 }
1413 
1414 /*
1415  * Free all the directory blocks and reset the data fork.  The caller must
1416  * join the inode to the transaction.  This function returns with the inode
1417  * joined to a clean scrub transaction.
1418  */
1419 STATIC int
1420 xrep_dir_reset_fork(
1421 	struct xrep_dir		*rd,
1422 	xfs_ino_t		parent_ino)
1423 {
1424 	struct xfs_scrub	*sc = rd->sc;
1425 	struct xfs_ifork	*ifp = xfs_ifork_ptr(sc->tempip, XFS_DATA_FORK);
1426 	int			error;
1427 
1428 	/* Unmap all the directory buffers. */
1429 	if (xfs_ifork_has_extents(ifp)) {
1430 		error = xrep_reap_ifork(sc, sc->tempip, XFS_DATA_FORK);
1431 		if (error)
1432 			return error;
1433 	}
1434 
1435 	trace_xrep_dir_reset_fork(sc->tempip, parent_ino);
1436 
1437 	/* Reset the data fork to an empty data fork. */
1438 	xfs_idestroy_fork(ifp);
1439 	ifp->if_bytes = 0;
1440 	sc->tempip->i_disk_size = 0;
1441 
1442 	/* Reinitialize the short form directory. */
1443 	xrep_dir_init_args(rd, sc->tempip, NULL);
1444 	return xfs_dir2_sf_create(&rd->args, parent_ino);
1445 }
1446 
1447 /*
1448  * Prepare both inodes' directory forks for exchanging mappings.  Promote the
1449  * tempfile from short format to leaf format, and if the file being repaired
1450  * has a short format data fork, turn it into an empty extent list.
1451  */
1452 STATIC int
1453 xrep_dir_swap_prep(
1454 	struct xfs_scrub	*sc,
1455 	bool			temp_local,
1456 	bool			ip_local)
1457 {
1458 	int			error;
1459 
1460 	/*
1461 	 * If the tempfile's directory is in shortform format, convert that to
1462 	 * a single leaf extent so that we can use the atomic mapping exchange.
1463 	 */
1464 	if (temp_local) {
1465 		struct xfs_da_args	args = {
1466 			.dp		= sc->tempip,
1467 			.geo		= sc->mp->m_dir_geo,
1468 			.whichfork	= XFS_DATA_FORK,
1469 			.trans		= sc->tp,
1470 			.total		= 1,
1471 			.owner		= sc->ip->i_ino,
1472 		};
1473 
1474 		error = xfs_dir2_sf_to_block(&args);
1475 		if (error)
1476 			return error;
1477 
1478 		/*
1479 		 * Roll the deferred log items to get us back to a clean
1480 		 * transaction.
1481 		 */
1482 		error = xfs_defer_finish(&sc->tp);
1483 		if (error)
1484 			return error;
1485 	}
1486 
1487 	/*
1488 	 * If the file being repaired had a shortform data fork, convert that
1489 	 * to an empty extent list in preparation for the atomic mapping
1490 	 * exchange.
1491 	 */
1492 	if (ip_local) {
1493 		struct xfs_ifork	*ifp;
1494 
1495 		ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1496 		xfs_idestroy_fork(ifp);
1497 		ifp->if_format = XFS_DINODE_FMT_EXTENTS;
1498 		ifp->if_nextents = 0;
1499 		ifp->if_bytes = 0;
1500 		ifp->if_data = NULL;
1501 		ifp->if_height = 0;
1502 
1503 		xfs_trans_log_inode(sc->tp, sc->ip,
1504 				XFS_ILOG_CORE | XFS_ILOG_DDATA);
1505 	}
1506 
1507 	return 0;
1508 }
1509 
1510 /*
1511  * Replace the inode number of a directory entry.
1512  */
1513 static int
1514 xrep_dir_replace(
1515 	struct xrep_dir		*rd,
1516 	struct xfs_inode	*dp,
1517 	const struct xfs_name	*name,
1518 	xfs_ino_t		inum,
1519 	xfs_extlen_t		total)
1520 {
1521 	struct xfs_scrub	*sc = rd->sc;
1522 	int			error;
1523 
1524 	ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
1525 
1526 	error = xfs_dir_ino_validate(sc->mp, inum);
1527 	if (error)
1528 		return error;
1529 
1530 	xrep_dir_init_args(rd, dp, name);
1531 	rd->args.inumber = inum;
1532 	rd->args.total = total;
1533 	return xfs_dir_replace_args(&rd->args);
1534 }
1535 
1536 /*
1537  * Reset the link count of this directory and adjust the unlinked list pointers
1538  * as needed.
1539  */
1540 STATIC int
1541 xrep_dir_set_nlink(
1542 	struct xrep_dir		*rd)
1543 {
1544 	struct xfs_scrub	*sc = rd->sc;
1545 	struct xfs_inode	*dp = sc->ip;
1546 	struct xfs_perag	*pag;
1547 	unsigned int		new_nlink = min_t(unsigned long long,
1548 						  rd->subdirs + 2,
1549 						  XFS_NLINK_PINNED);
1550 	int			error;
1551 
1552 	/*
1553 	 * The directory is not on the incore unlinked list, which means that
1554 	 * it needs to be reachable via the directory tree.  Update the nlink
1555 	 * with our observed link count.  If the directory has no parent, it
1556 	 * will be moved to the orphanage.
1557 	 */
1558 	if (!xfs_inode_on_unlinked_list(dp))
1559 		goto reset_nlink;
1560 
1561 	/*
1562 	 * The directory is on the unlinked list and we did not find any
1563 	 * dirents.  Set the link count to zero and let the directory
1564 	 * inactivate when the last reference drops.
1565 	 */
1566 	if (rd->dirents == 0) {
1567 		rd->needs_adoption = false;
1568 		new_nlink = 0;
1569 		goto reset_nlink;
1570 	}
1571 
1572 	/*
1573 	 * The directory is on the unlinked list and we found dirents.  This
1574 	 * directory needs to be reachable via the directory tree.  Remove the
1575 	 * dir from the unlinked list and update nlink with the observed link
1576 	 * count.  If the directory has no parent, it will be moved to the
1577 	 * orphanage.
1578 	 */
1579 	pag = xfs_perag_get(sc->mp, XFS_INO_TO_AGNO(sc->mp, dp->i_ino));
1580 	if (!pag) {
1581 		ASSERT(0);
1582 		return -EFSCORRUPTED;
1583 	}
1584 
1585 	error = xfs_iunlink_remove(sc->tp, pag, dp);
1586 	xfs_perag_put(pag);
1587 	if (error)
1588 		return error;
1589 
1590 reset_nlink:
1591 	if (VFS_I(dp)->i_nlink != new_nlink)
1592 		set_nlink(VFS_I(dp), new_nlink);
1593 	return 0;
1594 }
1595 
1596 /*
1597  * Finish replaying stashed dirent updates, allocate a transaction for
1598  * exchanging data fork mappings, and take the ILOCKs of both directories
1599  * before we commit the new directory structure.
1600  */
1601 STATIC int
1602 xrep_dir_finalize_tempdir(
1603 	struct xrep_dir		*rd)
1604 {
1605 	struct xfs_scrub	*sc = rd->sc;
1606 	int			error;
1607 
1608 	if (!xfs_has_parent(sc->mp))
1609 		return xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1610 
1611 	/*
1612 	 * Repair relies on the ILOCK to quiesce all possible dirent updates.
1613 	 * Replay all queued dirent updates into the tempdir before exchanging
1614 	 * the contents, even if that means dropping the ILOCKs and the
1615 	 * transaction.
1616 	 */
1617 	do {
1618 		error = xrep_dir_replay_updates(rd);
1619 		if (error)
1620 			return error;
1621 
1622 		error = xrep_tempexch_trans_alloc(sc, XFS_DATA_FORK, &rd->tx);
1623 		if (error)
1624 			return error;
1625 
1626 		if (xfarray_length(rd->dir_entries) == 0)
1627 			break;
1628 
1629 		xchk_trans_cancel(sc);
1630 		xrep_tempfile_iunlock_both(sc);
1631 	} while (!xchk_should_terminate(sc, &error));
1632 	return error;
1633 }
1634 
1635 /* Exchange the temporary directory's data fork with the one being repaired. */
1636 STATIC int
1637 xrep_dir_swap(
1638 	struct xrep_dir		*rd)
1639 {
1640 	struct xfs_scrub	*sc = rd->sc;
1641 	xfs_ino_t		ino;
1642 	bool			ip_local, temp_local;
1643 	int			error = 0;
1644 
1645 	/*
1646 	 * If we never found the parent for this directory, temporarily assign
1647 	 * the root dir as the parent; we'll move this to the orphanage after
1648 	 * exchanging the dir contents.  We hold the ILOCK of the dir being
1649 	 * repaired, so we're not worried about racy updates of dotdot.
1650 	 */
1651 	ASSERT(sc->ilock_flags & XFS_ILOCK_EXCL);
1652 	if (rd->pscan.parent_ino == NULLFSINO) {
1653 		rd->needs_adoption = true;
1654 		rd->pscan.parent_ino = rd->sc->mp->m_sb.sb_rootino;
1655 	}
1656 
1657 	/*
1658 	 * Reset the temporary directory's '..' entry to point to the parent
1659 	 * that we found.  The dirent replace code asserts if the dirent
1660 	 * already points at the new inumber, so we look it up here.
1661 	 *
1662 	 * It's also possible that this replacement could also expand a sf
1663 	 * tempdir into block format.
1664 	 */
1665 	error = xchk_dir_lookup(sc, rd->sc->tempip, &xfs_name_dotdot, &ino);
1666 	if (error)
1667 		return error;
1668 
1669 	if (rd->pscan.parent_ino != ino) {
1670 		error = xrep_dir_replace(rd, rd->sc->tempip, &xfs_name_dotdot,
1671 				rd->pscan.parent_ino, rd->tx.req.resblks);
1672 		if (error)
1673 			return error;
1674 	}
1675 
1676 	/*
1677 	 * Changing the dot and dotdot entries could have changed the shape of
1678 	 * the directory, so we recompute these.
1679 	 */
1680 	ip_local = sc->ip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1681 	temp_local = sc->tempip->i_df.if_format == XFS_DINODE_FMT_LOCAL;
1682 
1683 	/*
1684 	 * If the both files have a local format data fork and the rebuilt
1685 	 * directory data would fit in the repaired file's data fork, copy
1686 	 * the contents from the tempfile and update the directory link count.
1687 	 * We're done now.
1688 	 */
1689 	if (ip_local && temp_local &&
1690 	    sc->tempip->i_disk_size <= xfs_inode_data_fork_size(sc->ip)) {
1691 		xrep_tempfile_copyout_local(sc, XFS_DATA_FORK);
1692 		return xrep_dir_set_nlink(rd);
1693 	}
1694 
1695 	/*
1696 	 * Clean the transaction before we start working on exchanging
1697 	 * directory contents.
1698 	 */
1699 	error = xrep_tempfile_roll_trans(rd->sc);
1700 	if (error)
1701 		return error;
1702 
1703 	/* Otherwise, make sure both data forks are in block-mapping mode. */
1704 	error = xrep_dir_swap_prep(sc, temp_local, ip_local);
1705 	if (error)
1706 		return error;
1707 
1708 	/*
1709 	 * Set nlink of the directory in the same transaction sequence that
1710 	 * (atomically) commits the new directory data.
1711 	 */
1712 	error = xrep_dir_set_nlink(rd);
1713 	if (error)
1714 		return error;
1715 
1716 	return xrep_tempexch_contents(sc, &rd->tx);
1717 }
1718 
1719 /*
1720  * Exchange the new directory contents (which we created in the tempfile) with
1721  * the directory being repaired.
1722  */
1723 STATIC int
1724 xrep_dir_rebuild_tree(
1725 	struct xrep_dir		*rd)
1726 {
1727 	struct xfs_scrub	*sc = rd->sc;
1728 	int			error;
1729 
1730 	trace_xrep_dir_rebuild_tree(sc->ip, rd->pscan.parent_ino);
1731 
1732 	/*
1733 	 * Take the IOLOCK on the temporary file so that we can run dir
1734 	 * operations with the same locks held as we would for a normal file.
1735 	 * We still hold sc->ip's IOLOCK.
1736 	 */
1737 	error = xrep_tempfile_iolock_polled(rd->sc);
1738 	if (error)
1739 		return error;
1740 
1741 	/*
1742 	 * Allocate transaction, lock inodes, and make sure that we've replayed
1743 	 * all the stashed dirent updates to the tempdir.  After this point,
1744 	 * we're ready to exchange data fork mappings.
1745 	 */
1746 	error = xrep_dir_finalize_tempdir(rd);
1747 	if (error)
1748 		return error;
1749 
1750 	if (xchk_iscan_aborted(&rd->pscan.iscan))
1751 		return -ECANCELED;
1752 
1753 	/*
1754 	 * Exchange the tempdir's data fork with the file being repaired.  This
1755 	 * recreates the transaction and re-takes the ILOCK in the scrub
1756 	 * context.
1757 	 */
1758 	error = xrep_dir_swap(rd);
1759 	if (error)
1760 		return error;
1761 
1762 	/*
1763 	 * Release the old directory blocks and reset the data fork of the temp
1764 	 * directory to an empty shortform directory because inactivation does
1765 	 * nothing for directories.
1766 	 */
1767 	error = xrep_dir_reset_fork(rd, sc->mp->m_rootip->i_ino);
1768 	if (error)
1769 		return error;
1770 
1771 	/*
1772 	 * Roll to get a transaction without any inodes joined to it.  Then we
1773 	 * can drop the tempfile's ILOCK and IOLOCK before doing more work on
1774 	 * the scrub target directory.
1775 	 */
1776 	error = xfs_trans_roll(&sc->tp);
1777 	if (error)
1778 		return error;
1779 
1780 	xrep_tempfile_iunlock(sc);
1781 	xrep_tempfile_iounlock(sc);
1782 	return 0;
1783 }
1784 
1785 /* Set up the filesystem scan so we can regenerate directory entries. */
1786 STATIC int
1787 xrep_dir_setup_scan(
1788 	struct xrep_dir		*rd)
1789 {
1790 	struct xfs_scrub	*sc = rd->sc;
1791 	int			error;
1792 
1793 	/* Set up some staging memory for salvaging dirents. */
1794 	error = xfarray_create("directory entries", 0,
1795 			sizeof(struct xrep_dirent), &rd->dir_entries);
1796 	if (error)
1797 		return error;
1798 
1799 	error = xfblob_create("directory entry names", &rd->dir_names);
1800 	if (error)
1801 		goto out_xfarray;
1802 
1803 	if (xfs_has_parent(sc->mp))
1804 		error = __xrep_findparent_scan_start(sc, &rd->pscan,
1805 				xrep_dir_live_update);
1806 	else
1807 		error = xrep_findparent_scan_start(sc, &rd->pscan);
1808 	if (error)
1809 		goto out_xfblob;
1810 
1811 	return 0;
1812 
1813 out_xfblob:
1814 	xfblob_destroy(rd->dir_names);
1815 	rd->dir_names = NULL;
1816 out_xfarray:
1817 	xfarray_destroy(rd->dir_entries);
1818 	rd->dir_entries = NULL;
1819 	return error;
1820 }
1821 
1822 /*
1823  * Move the current file to the orphanage.
1824  *
1825  * Caller must hold IOLOCK_EXCL on @sc->ip, and no other inode locks.  Upon
1826  * successful return, the scrub transaction will have enough extra reservation
1827  * to make the move; it will hold IOLOCK_EXCL and ILOCK_EXCL of @sc->ip and the
1828  * orphanage; and both inodes will be ijoined.
1829  */
1830 STATIC int
1831 xrep_dir_move_to_orphanage(
1832 	struct xrep_dir		*rd)
1833 {
1834 	struct xfs_scrub	*sc = rd->sc;
1835 	xfs_ino_t		orig_parent, new_parent;
1836 	int			error;
1837 
1838 	/*
1839 	 * We are about to drop the ILOCK on sc->ip to lock the orphanage and
1840 	 * prepare for the adoption.  Therefore, look up the old dotdot entry
1841 	 * for sc->ip so that we can compare it after we re-lock sc->ip.
1842 	 */
1843 	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &orig_parent);
1844 	if (error)
1845 		return error;
1846 
1847 	/*
1848 	 * Drop the ILOCK on the scrub target and commit the transaction.
1849 	 * Adoption computes its own resource requirements and gathers the
1850 	 * necessary components.
1851 	 */
1852 	error = xrep_trans_commit(sc);
1853 	if (error)
1854 		return error;
1855 	xchk_iunlock(sc, XFS_ILOCK_EXCL);
1856 
1857 	/* If we can take the orphanage's iolock then we're ready to move. */
1858 	if (!xrep_orphanage_ilock_nowait(sc, XFS_IOLOCK_EXCL)) {
1859 		xchk_iunlock(sc, sc->ilock_flags);
1860 		error = xrep_orphanage_iolock_two(sc);
1861 		if (error)
1862 			return error;
1863 	}
1864 
1865 	/* Grab transaction and ILOCK the two files. */
1866 	error = xrep_adoption_trans_alloc(sc, &rd->adoption);
1867 	if (error)
1868 		return error;
1869 
1870 	error = xrep_adoption_compute_name(&rd->adoption, &rd->xname);
1871 	if (error)
1872 		return error;
1873 
1874 	/*
1875 	 * Now that we've reacquired the ILOCK on sc->ip, look up the dotdot
1876 	 * entry again.  If the parent changed or the child was unlinked while
1877 	 * the child directory was unlocked, we don't need to move the child to
1878 	 * the orphanage after all.
1879 	 */
1880 	error = xchk_dir_lookup(sc, sc->ip, &xfs_name_dotdot, &new_parent);
1881 	if (error)
1882 		return error;
1883 
1884 	/*
1885 	 * Attach to the orphanage if we still have a linked directory and it
1886 	 * hasn't been moved.
1887 	 */
1888 	if (orig_parent == new_parent && VFS_I(sc->ip)->i_nlink > 0) {
1889 		error = xrep_adoption_move(&rd->adoption);
1890 		if (error)
1891 			return error;
1892 	}
1893 
1894 	/*
1895 	 * Launder the scrub transaction so we can drop the orphanage ILOCK
1896 	 * and IOLOCK.  Return holding the scrub target's ILOCK and IOLOCK.
1897 	 */
1898 	error = xrep_adoption_trans_roll(&rd->adoption);
1899 	if (error)
1900 		return error;
1901 
1902 	xrep_orphanage_iunlock(sc, XFS_ILOCK_EXCL);
1903 	xrep_orphanage_iunlock(sc, XFS_IOLOCK_EXCL);
1904 	return 0;
1905 }
1906 
1907 /*
1908  * Repair the directory metadata.
1909  *
1910  * XXX: Directory entry buffers can be multiple fsblocks in size.  The buffer
1911  * cache in XFS can't handle aliased multiblock buffers, so this might
1912  * misbehave if the directory blocks are crosslinked with other filesystem
1913  * metadata.
1914  *
1915  * XXX: Is it necessary to check the dcache for this directory to make sure
1916  * that we always recreate every cached entry?
1917  */
1918 int
1919 xrep_directory(
1920 	struct xfs_scrub	*sc)
1921 {
1922 	struct xrep_dir		*rd = sc->buf;
1923 	int			error;
1924 
1925 	/* The rmapbt is required to reap the old data fork. */
1926 	if (!xfs_has_rmapbt(sc->mp))
1927 		return -EOPNOTSUPP;
1928 	/* We require atomic file exchange range to rebuild anything. */
1929 	if (!xfs_has_exchange_range(sc->mp))
1930 		return -EOPNOTSUPP;
1931 
1932 	error = xrep_dir_setup_scan(rd);
1933 	if (error)
1934 		return error;
1935 
1936 	if (xfs_has_parent(sc->mp))
1937 		error = xrep_dir_scan_dirtree(rd);
1938 	else
1939 		error = xrep_dir_salvage_entries(rd);
1940 	if (error)
1941 		goto out_teardown;
1942 
1943 	/* Last chance to abort before we start committing fixes. */
1944 	if (xchk_should_terminate(sc, &error))
1945 		goto out_teardown;
1946 
1947 	error = xrep_dir_rebuild_tree(rd);
1948 	if (error)
1949 		goto out_teardown;
1950 
1951 	if (rd->needs_adoption) {
1952 		if (!xrep_orphanage_can_adopt(rd->sc))
1953 			error = -EFSCORRUPTED;
1954 		else
1955 			error = xrep_dir_move_to_orphanage(rd);
1956 		if (error)
1957 			goto out_teardown;
1958 	}
1959 
1960 out_teardown:
1961 	xrep_dir_teardown(sc);
1962 	return error;
1963 }
1964