xref: /linux/fs/xfs/scrub/inode_repair.c (revision 5302a5c8beb21d01b7b8d92cc73b6871bc27d7bf)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_bit.h"
15 #include "xfs_log_format.h"
16 #include "xfs_trans.h"
17 #include "xfs_sb.h"
18 #include "xfs_inode.h"
19 #include "xfs_icache.h"
20 #include "xfs_inode_buf.h"
21 #include "xfs_inode_fork.h"
22 #include "xfs_ialloc.h"
23 #include "xfs_da_format.h"
24 #include "xfs_reflink.h"
25 #include "xfs_alloc.h"
26 #include "xfs_rmap.h"
27 #include "xfs_rmap_btree.h"
28 #include "xfs_bmap.h"
29 #include "xfs_bmap_btree.h"
30 #include "xfs_bmap_util.h"
31 #include "xfs_dir2.h"
32 #include "xfs_dir2_priv.h"
33 #include "xfs_quota_defs.h"
34 #include "xfs_quota.h"
35 #include "xfs_ag.h"
36 #include "xfs_rtbitmap.h"
37 #include "xfs_attr_leaf.h"
38 #include "xfs_log_priv.h"
39 #include "xfs_health.h"
40 #include "xfs_symlink_remote.h"
41 #include "scrub/xfs_scrub.h"
42 #include "scrub/scrub.h"
43 #include "scrub/common.h"
44 #include "scrub/btree.h"
45 #include "scrub/trace.h"
46 #include "scrub/repair.h"
47 #include "scrub/iscan.h"
48 #include "scrub/readdir.h"
49 
50 /*
51  * Inode Record Repair
52  * ===================
53  *
54  * Roughly speaking, inode problems can be classified based on whether or not
55  * they trip the dinode verifiers.  If those trip, then we won't be able to
56  * xfs_iget ourselves the inode.
57  *
58  * Therefore, the xrep_dinode_* functions fix anything that will cause the
59  * inode buffer verifier or the dinode verifier to fail.  The xrep_inode_*
60  * functions fix things on live incore inodes.  The inode repair functions make
61  * decisions with security and usability implications when reviving a file:
62  *
63  * - Files with zero di_mode or a garbage di_mode are converted to regular file
64  *   that only root can read.  This file may not actually contain user data,
65  *   if the file was not previously a regular file.  Setuid and setgid bits
66  *   are cleared.
67  *
68  * - Zero-size directories can be truncated to look empty.  It is necessary to
69  *   run the bmapbtd and directory repair functions to fully rebuild the
70  *   directory.
71  *
72  * - Zero-size symbolic link targets can be truncated to '?'.  It is necessary
73  *   to run the bmapbtd and symlink repair functions to salvage the symlink.
74  *
75  * - Invalid extent size hints will be removed.
76  *
77  * - Quotacheck will be scheduled if we repaired an inode that was so badly
78  *   damaged that the ondisk inode had to be rebuilt.
79  *
80  * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
81  *   Setuid and setgid bits are cleared.
82  *
83  * - Data and attr forks are reset to extents format with zero extents if the
84  *   fork data is inconsistent.  It is necessary to run the bmapbtd or bmapbta
85  *   repair functions to recover the space mapping.
86  *
87  * - ACLs will not be recovered if the attr fork is zapped or the extended
88  *   attribute structure itself requires salvaging.
89  *
90  * - If the attr fork is zapped, the user and group ids are reset to root and
91  *   the setuid and setgid bits are removed.
92  */
93 
/*
 * All the information we need to repair the ondisk inode if we can't iget the
 * incore inode.  We don't allocate this buffer unless we're going to perform
 * a repair to the ondisk inode cluster buffer.
 */
struct xrep_inode {
	/* Inode mapping that we saved from the initial lookup attempt. */
	struct xfs_imap		imap;

	/* Scrub context that this repair state belongs to. */
	struct xfs_scrub	*sc;

	/* Blocks in use on the data device by data extents or bmbt blocks. */
	xfs_rfsblock_t		data_blocks;

	/* Blocks in use on the rt device. */
	xfs_rfsblock_t		rt_blocks;

	/* Blocks in use by the attr fork. */
	xfs_rfsblock_t		attr_blocks;

	/* Number of data device extents for the data fork. */
	xfs_extnum_t		data_extents;

	/*
	 * Number of realtime device extents for the data fork.  If
	 * data_extents and rt_extents indicate that the data fork has extents
	 * on both devices, we'll just back away slowly.
	 */
	xfs_extnum_t		rt_extents;

	/* Number of (data device) extents for the attr fork. */
	xfs_aextnum_t		attr_extents;

	/* Sick state to set after zapping parts of the inode. */
	unsigned int		ino_sick_mask;

	/* Must we remove all access from this file? */
	bool			zap_acls;

	/* Inode scanner to see if we can find the ftype from dirents */
	struct xchk_iscan	ftype_iscan;
	/*
	 * File type recorded in the directory entries that point at the inode
	 * being repaired.  This stays XFS_DIR3_FT_UNKNOWN until the directory
	 * scan finds such an entry.
	 */
	uint8_t			alleged_ftype;
};
137 
138 /*
139  * Setup function for inode repair.  @imap contains the ondisk inode mapping
140  * information so that we can correct the ondisk inode cluster buffer if
141  * necessary to make iget work.
142  */
143 int
144 xrep_setup_inode(
145 	struct xfs_scrub	*sc,
146 	const struct xfs_imap	*imap)
147 {
148 	struct xrep_inode	*ri;
149 
150 	sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
151 	if (!sc->buf)
152 		return -ENOMEM;
153 
154 	ri = sc->buf;
155 	memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
156 	ri->sc = sc;
157 	return 0;
158 }
159 
160 /*
161  * Make sure this ondisk inode can pass the inode buffer verifier.  This is
162  * not the same as the dinode verifier.
163  */
164 STATIC void
165 xrep_dinode_buf_core(
166 	struct xfs_scrub	*sc,
167 	struct xfs_buf		*bp,
168 	unsigned int		ioffset)
169 {
170 	struct xfs_dinode	*dip = xfs_buf_offset(bp, ioffset);
171 	struct xfs_trans	*tp = sc->tp;
172 	struct xfs_mount	*mp = sc->mp;
173 	xfs_agino_t		agino;
174 	bool			crc_ok = false;
175 	bool			magic_ok = false;
176 	bool			unlinked_ok = false;
177 
178 	agino = be32_to_cpu(dip->di_next_unlinked);
179 
180 	if (xfs_verify_agino_or_null(bp->b_pag, agino))
181 		unlinked_ok = true;
182 
183 	if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
184 	    xfs_dinode_good_version(mp, dip->di_version))
185 		magic_ok = true;
186 
187 	if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
188 			XFS_DINODE_CRC_OFF))
189 		crc_ok = true;
190 
191 	if (magic_ok && unlinked_ok && crc_ok)
192 		return;
193 
194 	if (!magic_ok) {
195 		dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
196 		dip->di_version = 3;
197 	}
198 	if (!unlinked_ok)
199 		dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
200 	xfs_dinode_calc_crc(mp, dip);
201 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
202 	xfs_trans_log_buf(tp, bp, ioffset,
203 				  ioffset + sizeof(struct xfs_dinode) - 1);
204 }
205 
206 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */
207 STATIC void
208 xrep_dinode_buf(
209 	struct xfs_scrub	*sc,
210 	struct xfs_buf		*bp)
211 {
212 	struct xfs_mount	*mp = sc->mp;
213 	int			i;
214 	int			ni;
215 
216 	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
217 	for (i = 0; i < ni; i++)
218 		xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
219 }
220 
221 /* Reinitialize things that never change in an inode. */
222 STATIC void
223 xrep_dinode_header(
224 	struct xfs_scrub	*sc,
225 	struct xfs_dinode	*dip)
226 {
227 	trace_xrep_dinode_header(sc, dip);
228 
229 	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
230 	if (!xfs_dinode_good_version(sc->mp, dip->di_version))
231 		dip->di_version = 3;
232 	dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
233 	uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
234 	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
235 }
236 
237 /*
238  * If this directory entry points to the scrub target inode, then the directory
239  * we're scanning is the parent of the scrub target inode.
240  */
241 STATIC int
242 xrep_dinode_findmode_dirent(
243 	struct xfs_scrub		*sc,
244 	struct xfs_inode		*dp,
245 	xfs_dir2_dataptr_t		dapos,
246 	const struct xfs_name		*name,
247 	xfs_ino_t			ino,
248 	void				*priv)
249 {
250 	struct xrep_inode		*ri = priv;
251 	int				error = 0;
252 
253 	if (xchk_should_terminate(ri->sc, &error))
254 		return error;
255 
256 	if (ino != sc->sm->sm_ino)
257 		return 0;
258 
259 	/* Ignore garbage directory entry names. */
260 	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
261 		return -EFSCORRUPTED;
262 
263 	/* Don't pick up dot or dotdot entries; we only want child dirents. */
264 	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
265 	    xfs_dir2_samename(name, &xfs_name_dot))
266 		return 0;
267 
268 	/*
269 	 * Uhoh, more than one parent for this inode and they don't agree on
270 	 * the file type?
271 	 */
272 	if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
273 	    ri->alleged_ftype != name->type) {
274 		trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
275 				ri->alleged_ftype);
276 		return -EFSCORRUPTED;
277 	}
278 
279 	/* We found a potential parent; remember the ftype. */
280 	trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
281 	ri->alleged_ftype = name->type;
282 	return 0;
283 }
284 
285 /* Try to lock a directory, or wait a jiffy. */
286 static inline int
287 xrep_dinode_ilock_nowait(
288 	struct xfs_inode	*dp,
289 	unsigned int		lock_mode)
290 {
291 	if (xfs_ilock_nowait(dp, lock_mode))
292 		return true;
293 
294 	schedule_timeout_killable(1);
295 	return false;
296 }
297 
298 /*
299  * Try to lock a directory to look for ftype hints.  Since we already hold the
300  * AGI buffer, we cannot block waiting for the ILOCK because rename can take
301  * the ILOCK and then try to lock AGIs.
302  */
303 STATIC int
304 xrep_dinode_trylock_directory(
305 	struct xrep_inode	*ri,
306 	struct xfs_inode	*dp,
307 	unsigned int		*lock_modep)
308 {
309 	unsigned long		deadline = jiffies + msecs_to_jiffies(30000);
310 	unsigned int		lock_mode;
311 	int			error = 0;
312 
313 	do {
314 		if (xchk_should_terminate(ri->sc, &error))
315 			return error;
316 
317 		if (xfs_need_iread_extents(&dp->i_df))
318 			lock_mode = XFS_ILOCK_EXCL;
319 		else
320 			lock_mode = XFS_ILOCK_SHARED;
321 
322 		if (xrep_dinode_ilock_nowait(dp, lock_mode)) {
323 			*lock_modep = lock_mode;
324 			return 0;
325 		}
326 	} while (!time_is_before_jiffies(deadline));
327 	return -EBUSY;
328 }
329 
330 /*
331  * If this is a directory, walk the dirents looking for any that point to the
332  * scrub target inode.
333  */
334 STATIC int
335 xrep_dinode_findmode_walk_directory(
336 	struct xrep_inode	*ri,
337 	struct xfs_inode	*dp)
338 {
339 	struct xfs_scrub	*sc = ri->sc;
340 	unsigned int		lock_mode;
341 	int			error = 0;
342 
343 	/*
344 	 * Scan the directory to see if there it contains an entry pointing to
345 	 * the directory that we are repairing.
346 	 */
347 	error = xrep_dinode_trylock_directory(ri, dp, &lock_mode);
348 	if (error)
349 		return error;
350 
351 	/*
352 	 * If this directory is known to be sick, we cannot scan it reliably
353 	 * and must abort.
354 	 */
355 	if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
356 				       XFS_SICK_INO_BMBTD |
357 				       XFS_SICK_INO_DIR)) {
358 		error = -EFSCORRUPTED;
359 		goto out_unlock;
360 	}
361 
362 	/*
363 	 * We cannot complete our parent pointer scan if a directory looks as
364 	 * though it has been zapped by the inode record repair code.
365 	 */
366 	if (xchk_dir_looks_zapped(dp)) {
367 		error = -EBUSY;
368 		goto out_unlock;
369 	}
370 
371 	error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
372 	if (error)
373 		goto out_unlock;
374 
375 out_unlock:
376 	xfs_iunlock(dp, lock_mode);
377 	return error;
378 }
379 
380 /*
381  * Try to find the mode of the inode being repaired by looking for directories
382  * that point down to this file.
383  */
384 STATIC int
385 xrep_dinode_find_mode(
386 	struct xrep_inode	*ri,
387 	uint16_t		*mode)
388 {
389 	struct xfs_scrub	*sc = ri->sc;
390 	struct xfs_inode	*dp;
391 	int			error;
392 
393 	/* No ftype means we have no other metadata to consult. */
394 	if (!xfs_has_ftype(sc->mp)) {
395 		*mode = S_IFREG;
396 		return 0;
397 	}
398 
399 	/*
400 	 * Scan all directories for parents that might point down to this
401 	 * inode.  Skip the inode being repaired during the scan since it
402 	 * cannot be its own parent.  Note that we still hold the AGI locked
403 	 * so there's a real possibility that _iscan_iter can return EBUSY.
404 	 */
405 	xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
406 	xchk_iscan_set_agi_trylock(&ri->ftype_iscan);
407 	ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
408 	ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
409 	while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
410 		if (S_ISDIR(VFS_I(dp)->i_mode))
411 			error = xrep_dinode_findmode_walk_directory(ri, dp);
412 		xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
413 		xchk_irele(sc, dp);
414 		if (error < 0)
415 			break;
416 		if (xchk_should_terminate(sc, &error))
417 			break;
418 	}
419 	xchk_iscan_iter_finish(&ri->ftype_iscan);
420 	xchk_iscan_teardown(&ri->ftype_iscan);
421 
422 	if (error == -EBUSY) {
423 		if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
424 			/*
425 			 * If we got an EBUSY after finding at least one
426 			 * dirent, that means the scan found an inode on the
427 			 * inactivation list and could not open it.  Accept the
428 			 * alleged ftype and install a new mode below.
429 			 */
430 			error = 0;
431 		} else if (!(sc->flags & XCHK_TRY_HARDER)) {
432 			/*
433 			 * Otherwise, retry the operation one time to see if
434 			 * the reason for the delay is an inode from the same
435 			 * cluster buffer waiting on the inactivation list.
436 			 */
437 			error = -EDEADLOCK;
438 		}
439 	}
440 	if (error)
441 		return error;
442 
443 	/*
444 	 * Convert the discovered ftype into the file mode.  If all else fails,
445 	 * return S_IFREG.
446 	 */
447 	switch (ri->alleged_ftype) {
448 	case XFS_DIR3_FT_DIR:
449 		*mode = S_IFDIR;
450 		break;
451 	case XFS_DIR3_FT_WHT:
452 	case XFS_DIR3_FT_CHRDEV:
453 		*mode = S_IFCHR;
454 		break;
455 	case XFS_DIR3_FT_BLKDEV:
456 		*mode = S_IFBLK;
457 		break;
458 	case XFS_DIR3_FT_FIFO:
459 		*mode = S_IFIFO;
460 		break;
461 	case XFS_DIR3_FT_SOCK:
462 		*mode = S_IFSOCK;
463 		break;
464 	case XFS_DIR3_FT_SYMLINK:
465 		*mode = S_IFLNK;
466 		break;
467 	default:
468 		*mode = S_IFREG;
469 		break;
470 	}
471 	return 0;
472 }
473 
474 /* Turn di_mode into /something/ recognizable.  Returns true if we succeed. */
475 STATIC int
476 xrep_dinode_mode(
477 	struct xrep_inode	*ri,
478 	struct xfs_dinode	*dip)
479 {
480 	struct xfs_scrub	*sc = ri->sc;
481 	uint16_t		mode = be16_to_cpu(dip->di_mode);
482 	int			error;
483 
484 	trace_xrep_dinode_mode(sc, dip);
485 
486 	if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
487 		return 0;
488 
489 	/* Try to fix the mode.  If we cannot, then leave everything alone. */
490 	error = xrep_dinode_find_mode(ri, &mode);
491 	switch (error) {
492 	case -EINTR:
493 	case -EBUSY:
494 	case -EDEADLOCK:
495 		/* temporary failure or fatal signal */
496 		return error;
497 	case 0:
498 		/* found mode */
499 		break;
500 	default:
501 		/* some other error, assume S_IFREG */
502 		mode = S_IFREG;
503 		break;
504 	}
505 
506 	/* bad mode, so we set it to a file that only root can read */
507 	dip->di_mode = cpu_to_be16(mode);
508 	dip->di_uid = 0;
509 	dip->di_gid = 0;
510 	ri->zap_acls = true;
511 	return 0;
512 }
513 
514 /* Fix any conflicting flags that the verifiers complain about. */
515 STATIC void
516 xrep_dinode_flags(
517 	struct xfs_scrub	*sc,
518 	struct xfs_dinode	*dip,
519 	bool			isrt)
520 {
521 	struct xfs_mount	*mp = sc->mp;
522 	uint64_t		flags2 = be64_to_cpu(dip->di_flags2);
523 	uint16_t		flags = be16_to_cpu(dip->di_flags);
524 	uint16_t		mode = be16_to_cpu(dip->di_mode);
525 
526 	trace_xrep_dinode_flags(sc, dip);
527 
528 	if (isrt)
529 		flags |= XFS_DIFLAG_REALTIME;
530 	else
531 		flags &= ~XFS_DIFLAG_REALTIME;
532 
533 	/*
534 	 * For regular files on a reflink filesystem, set the REFLINK flag to
535 	 * protect shared extents.  A later stage will actually check those
536 	 * extents and clear the flag if possible.
537 	 */
538 	if (xfs_has_reflink(mp) && S_ISREG(mode))
539 		flags2 |= XFS_DIFLAG2_REFLINK;
540 	else
541 		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
542 	if (flags & XFS_DIFLAG_REALTIME)
543 		flags2 &= ~XFS_DIFLAG2_REFLINK;
544 	if (!xfs_has_bigtime(mp))
545 		flags2 &= ~XFS_DIFLAG2_BIGTIME;
546 	if (!xfs_has_large_extent_counts(mp))
547 		flags2 &= ~XFS_DIFLAG2_NREXT64;
548 	if (flags2 & XFS_DIFLAG2_NREXT64)
549 		dip->di_nrext64_pad = 0;
550 	else if (dip->di_version >= 3)
551 		dip->di_v3_pad = 0;
552 	dip->di_flags = cpu_to_be16(flags);
553 	dip->di_flags2 = cpu_to_be64(flags2);
554 }
555 
556 /*
557  * Blow out symlink; now it points nowhere.  We don't have to worry about
558  * incore state because this inode is failing the verifiers.
559  */
560 STATIC void
561 xrep_dinode_zap_symlink(
562 	struct xrep_inode	*ri,
563 	struct xfs_dinode	*dip)
564 {
565 	struct xfs_scrub	*sc = ri->sc;
566 	char			*p;
567 
568 	trace_xrep_dinode_zap_symlink(sc, dip);
569 
570 	dip->di_format = XFS_DINODE_FMT_LOCAL;
571 	dip->di_size = cpu_to_be64(1);
572 	p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
573 	*p = '?';
574 	ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
575 }
576 
577 /*
578  * Blow out dir, make the parent point to the root.  In the future repair will
579  * reconstruct this directory for us.  Note that there's no in-core directory
580  * inode because the sf verifier tripped, so we don't have to worry about the
581  * dentry cache.
582  */
583 STATIC void
584 xrep_dinode_zap_dir(
585 	struct xrep_inode	*ri,
586 	struct xfs_dinode	*dip)
587 {
588 	struct xfs_scrub	*sc = ri->sc;
589 	struct xfs_mount	*mp = sc->mp;
590 	struct xfs_dir2_sf_hdr	*sfp;
591 	int			i8count;
592 
593 	trace_xrep_dinode_zap_dir(sc, dip);
594 
595 	dip->di_format = XFS_DINODE_FMT_LOCAL;
596 	i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
597 	sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
598 	sfp->count = 0;
599 	sfp->i8count = i8count;
600 	xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
601 	dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
602 	ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
603 }
604 
605 /* Make sure we don't have a garbage file size. */
606 STATIC void
607 xrep_dinode_size(
608 	struct xrep_inode	*ri,
609 	struct xfs_dinode	*dip)
610 {
611 	struct xfs_scrub	*sc = ri->sc;
612 	uint64_t		size = be64_to_cpu(dip->di_size);
613 	uint16_t		mode = be16_to_cpu(dip->di_mode);
614 
615 	trace_xrep_dinode_size(sc, dip);
616 
617 	switch (mode & S_IFMT) {
618 	case S_IFIFO:
619 	case S_IFCHR:
620 	case S_IFBLK:
621 	case S_IFSOCK:
622 		/* di_size can't be nonzero for special files */
623 		dip->di_size = 0;
624 		break;
625 	case S_IFREG:
626 		/* Regular files can't be larger than 2^63-1 bytes. */
627 		dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
628 		break;
629 	case S_IFLNK:
630 		/*
631 		 * Truncate ridiculously oversized symlinks.  If the size is
632 		 * zero, reset it to point to the current directory.  Both of
633 		 * these conditions trigger dinode verifier errors, so there
634 		 * is no in-core state to reset.
635 		 */
636 		if (size > XFS_SYMLINK_MAXLEN)
637 			dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
638 		else if (size == 0)
639 			xrep_dinode_zap_symlink(ri, dip);
640 		break;
641 	case S_IFDIR:
642 		/*
643 		 * Directories can't have a size larger than 32G.  If the size
644 		 * is zero, reset it to an empty directory.  Both of these
645 		 * conditions trigger dinode verifier errors, so there is no
646 		 * in-core state to reset.
647 		 */
648 		if (size > XFS_DIR2_SPACE_SIZE)
649 			dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
650 		else if (size == 0)
651 			xrep_dinode_zap_dir(ri, dip);
652 		break;
653 	}
654 }
655 
656 /* Fix extent size hints. */
657 STATIC void
658 xrep_dinode_extsize_hints(
659 	struct xfs_scrub	*sc,
660 	struct xfs_dinode	*dip)
661 {
662 	struct xfs_mount	*mp = sc->mp;
663 	uint64_t		flags2 = be64_to_cpu(dip->di_flags2);
664 	uint16_t		flags = be16_to_cpu(dip->di_flags);
665 	uint16_t		mode = be16_to_cpu(dip->di_mode);
666 
667 	xfs_failaddr_t		fa;
668 
669 	trace_xrep_dinode_extsize_hints(sc, dip);
670 
671 	fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
672 			mode, flags);
673 	if (fa) {
674 		dip->di_extsize = 0;
675 		dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
676 					      XFS_DIFLAG_EXTSZINHERIT);
677 	}
678 
679 	if (dip->di_version < 3)
680 		return;
681 
682 	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
683 			mode, flags, flags2);
684 	if (fa) {
685 		dip->di_cowextsize = 0;
686 		dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
687 	}
688 }
689 
690 /* Count extents and blocks for an inode given an rmap. */
691 STATIC int
692 xrep_dinode_walk_rmap(
693 	struct xfs_btree_cur		*cur,
694 	const struct xfs_rmap_irec	*rec,
695 	void				*priv)
696 {
697 	struct xrep_inode		*ri = priv;
698 	int				error = 0;
699 
700 	if (xchk_should_terminate(ri->sc, &error))
701 		return error;
702 
703 	/* We only care about this inode. */
704 	if (rec->rm_owner != ri->sc->sm->sm_ino)
705 		return 0;
706 
707 	if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
708 		ri->attr_blocks += rec->rm_blockcount;
709 		if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
710 			ri->attr_extents++;
711 
712 		return 0;
713 	}
714 
715 	ri->data_blocks += rec->rm_blockcount;
716 	if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
717 		ri->data_extents++;
718 
719 	return 0;
720 }
721 
722 /* Count extents and blocks for an inode from all AG rmap data. */
723 STATIC int
724 xrep_dinode_count_ag_rmaps(
725 	struct xrep_inode	*ri,
726 	struct xfs_perag	*pag)
727 {
728 	struct xfs_btree_cur	*cur;
729 	struct xfs_buf		*agf;
730 	int			error;
731 
732 	error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
733 	if (error)
734 		return error;
735 
736 	cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
737 	error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
738 	xfs_btree_del_cursor(cur, error);
739 	xfs_trans_brelse(ri->sc->tp, agf);
740 	return error;
741 }
742 
743 /* Count extents and blocks for a given inode from all rmap data. */
744 STATIC int
745 xrep_dinode_count_rmaps(
746 	struct xrep_inode	*ri)
747 {
748 	struct xfs_perag	*pag;
749 	xfs_agnumber_t		agno;
750 	int			error;
751 
752 	if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
753 		return -EOPNOTSUPP;
754 
755 	for_each_perag(ri->sc->mp, agno, pag) {
756 		error = xrep_dinode_count_ag_rmaps(ri, pag);
757 		if (error) {
758 			xfs_perag_rele(pag);
759 			return error;
760 		}
761 	}
762 
763 	/* Can't have extents on both the rt and the data device. */
764 	if (ri->data_extents && ri->rt_extents)
765 		return -EFSCORRUPTED;
766 
767 	trace_xrep_dinode_count_rmaps(ri->sc,
768 			ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
769 			ri->data_extents, ri->rt_extents, ri->attr_extents);
770 	return 0;
771 }
772 
773 /* Return true if this extents-format ifork looks like garbage. */
774 STATIC bool
775 xrep_dinode_bad_extents_fork(
776 	struct xfs_scrub	*sc,
777 	struct xfs_dinode	*dip,
778 	unsigned int		dfork_size,
779 	int			whichfork)
780 {
781 	struct xfs_bmbt_irec	new;
782 	struct xfs_bmbt_rec	*dp;
783 	xfs_extnum_t		nex;
784 	bool			isrt;
785 	unsigned int		i;
786 
787 	nex = xfs_dfork_nextents(dip, whichfork);
788 	if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
789 		return true;
790 
791 	dp = XFS_DFORK_PTR(dip, whichfork);
792 
793 	isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
794 	for (i = 0; i < nex; i++, dp++) {
795 		xfs_failaddr_t	fa;
796 
797 		xfs_bmbt_disk_get_all(dp, &new);
798 		fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
799 				&new);
800 		if (fa)
801 			return true;
802 	}
803 
804 	return false;
805 }
806 
807 /* Return true if this btree-format ifork looks like garbage. */
808 STATIC bool
809 xrep_dinode_bad_bmbt_fork(
810 	struct xfs_scrub	*sc,
811 	struct xfs_dinode	*dip,
812 	unsigned int		dfork_size,
813 	int			whichfork)
814 {
815 	struct xfs_bmdr_block	*dfp;
816 	xfs_extnum_t		nex;
817 	unsigned int		i;
818 	unsigned int		dmxr;
819 	unsigned int		nrecs;
820 	unsigned int		level;
821 
822 	nex = xfs_dfork_nextents(dip, whichfork);
823 	if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
824 		return true;
825 
826 	if (dfork_size < sizeof(struct xfs_bmdr_block))
827 		return true;
828 
829 	dfp = XFS_DFORK_PTR(dip, whichfork);
830 	nrecs = be16_to_cpu(dfp->bb_numrecs);
831 	level = be16_to_cpu(dfp->bb_level);
832 
833 	if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
834 		return true;
835 	if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
836 		return true;
837 
838 	dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
839 	for (i = 1; i <= nrecs; i++) {
840 		struct xfs_bmbt_key	*fkp;
841 		xfs_bmbt_ptr_t		*fpp;
842 		xfs_fileoff_t		fileoff;
843 		xfs_fsblock_t		fsbno;
844 
845 		fkp = XFS_BMDR_KEY_ADDR(dfp, i);
846 		fileoff = be64_to_cpu(fkp->br_startoff);
847 		if (!xfs_verify_fileoff(sc->mp, fileoff))
848 			return true;
849 
850 		fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
851 		fsbno = be64_to_cpu(*fpp);
852 		if (!xfs_verify_fsbno(sc->mp, fsbno))
853 			return true;
854 	}
855 
856 	return false;
857 }
858 
859 /*
860  * Check the data fork for things that will fail the ifork verifiers or the
861  * ifork formatters.
862  */
863 STATIC bool
864 xrep_dinode_check_dfork(
865 	struct xfs_scrub	*sc,
866 	struct xfs_dinode	*dip,
867 	uint16_t		mode)
868 {
869 	void			*dfork_ptr;
870 	int64_t			data_size;
871 	unsigned int		fmt;
872 	unsigned int		dfork_size;
873 
874 	/*
875 	 * Verifier functions take signed int64_t, so check for bogus negative
876 	 * values first.
877 	 */
878 	data_size = be64_to_cpu(dip->di_size);
879 	if (data_size < 0)
880 		return true;
881 
882 	fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
883 	switch (mode & S_IFMT) {
884 	case S_IFIFO:
885 	case S_IFCHR:
886 	case S_IFBLK:
887 	case S_IFSOCK:
888 		if (fmt != XFS_DINODE_FMT_DEV)
889 			return true;
890 		break;
891 	case S_IFREG:
892 		if (fmt == XFS_DINODE_FMT_LOCAL)
893 			return true;
894 		fallthrough;
895 	case S_IFLNK:
896 	case S_IFDIR:
897 		switch (fmt) {
898 		case XFS_DINODE_FMT_LOCAL:
899 		case XFS_DINODE_FMT_EXTENTS:
900 		case XFS_DINODE_FMT_BTREE:
901 			break;
902 		default:
903 			return true;
904 		}
905 		break;
906 	default:
907 		return true;
908 	}
909 
910 	dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
911 	dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
912 
913 	switch (fmt) {
914 	case XFS_DINODE_FMT_DEV:
915 		break;
916 	case XFS_DINODE_FMT_LOCAL:
917 		/* dir/symlink structure cannot be larger than the fork */
918 		if (data_size > dfork_size)
919 			return true;
920 		/* directory structure must pass verification. */
921 		if (S_ISDIR(mode) &&
922 		    xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
923 			return true;
924 		/* symlink structure must pass verification. */
925 		if (S_ISLNK(mode) &&
926 		    xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
927 			return true;
928 		break;
929 	case XFS_DINODE_FMT_EXTENTS:
930 		if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
931 				XFS_DATA_FORK))
932 			return true;
933 		break;
934 	case XFS_DINODE_FMT_BTREE:
935 		if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
936 				XFS_DATA_FORK))
937 			return true;
938 		break;
939 	default:
940 		return true;
941 	}
942 
943 	return false;
944 }
945 
946 static void
947 xrep_dinode_set_data_nextents(
948 	struct xfs_dinode	*dip,
949 	xfs_extnum_t		nextents)
950 {
951 	if (xfs_dinode_has_large_extent_counts(dip))
952 		dip->di_big_nextents = cpu_to_be64(nextents);
953 	else
954 		dip->di_nextents = cpu_to_be32(nextents);
955 }
956 
957 static void
958 xrep_dinode_set_attr_nextents(
959 	struct xfs_dinode	*dip,
960 	xfs_extnum_t		nextents)
961 {
962 	if (xfs_dinode_has_large_extent_counts(dip))
963 		dip->di_big_anextents = cpu_to_be32(nextents);
964 	else
965 		dip->di_anextents = cpu_to_be16(nextents);
966 }
967 
968 /* Reset the data fork to something sane. */
969 STATIC void
970 xrep_dinode_zap_dfork(
971 	struct xrep_inode	*ri,
972 	struct xfs_dinode	*dip,
973 	uint16_t		mode)
974 {
975 	struct xfs_scrub	*sc = ri->sc;
976 
977 	trace_xrep_dinode_zap_dfork(sc, dip);
978 
979 	ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
980 
981 	xrep_dinode_set_data_nextents(dip, 0);
982 	ri->data_blocks = 0;
983 	ri->rt_blocks = 0;
984 
985 	/* Special files always get reset to DEV */
986 	switch (mode & S_IFMT) {
987 	case S_IFIFO:
988 	case S_IFCHR:
989 	case S_IFBLK:
990 	case S_IFSOCK:
991 		dip->di_format = XFS_DINODE_FMT_DEV;
992 		dip->di_size = 0;
993 		return;
994 	}
995 
996 	/*
997 	 * If we have data extents, reset to an empty map and hope the user
998 	 * will run the bmapbtd checker next.
999 	 */
1000 	if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
1001 		dip->di_format = XFS_DINODE_FMT_EXTENTS;
1002 		return;
1003 	}
1004 
1005 	/* Otherwise, reset the local format to the minimum. */
1006 	switch (mode & S_IFMT) {
1007 	case S_IFLNK:
1008 		xrep_dinode_zap_symlink(ri, dip);
1009 		break;
1010 	case S_IFDIR:
1011 		xrep_dinode_zap_dir(ri, dip);
1012 		break;
1013 	}
1014 }
1015 
1016 /*
1017  * Check the attr fork for things that will fail the ifork verifiers or the
1018  * ifork formatters.
1019  */
1020 STATIC bool
1021 xrep_dinode_check_afork(
1022 	struct xfs_scrub		*sc,
1023 	struct xfs_dinode		*dip)
1024 {
1025 	struct xfs_attr_sf_hdr		*afork_ptr;
1026 	size_t				attr_size;
1027 	unsigned int			afork_size;
1028 
1029 	if (XFS_DFORK_BOFF(dip) == 0)
1030 		return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
1031 		       xfs_dfork_attr_extents(dip) != 0;
1032 
1033 	afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1034 	afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1035 
1036 	switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
1037 	case XFS_DINODE_FMT_LOCAL:
1038 		/* Fork has to be large enough to extract the xattr size. */
1039 		if (afork_size < sizeof(struct xfs_attr_sf_hdr))
1040 			return true;
1041 
1042 		/* xattr structure cannot be larger than the fork */
1043 		attr_size = be16_to_cpu(afork_ptr->totsize);
1044 		if (attr_size > afork_size)
1045 			return true;
1046 
1047 		/* xattr structure must pass verification. */
1048 		return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
1049 	case XFS_DINODE_FMT_EXTENTS:
1050 		if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
1051 					XFS_ATTR_FORK))
1052 			return true;
1053 		break;
1054 	case XFS_DINODE_FMT_BTREE:
1055 		if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
1056 					XFS_ATTR_FORK))
1057 			return true;
1058 		break;
1059 	default:
1060 		return true;
1061 	}
1062 
1063 	return false;
1064 }
1065 
1066 /*
1067  * Reset the attr fork to empty.  Since the attr fork could have contained
1068  * ACLs, make the file readable only by root.
1069  */
1070 STATIC void
1071 xrep_dinode_zap_afork(
1072 	struct xrep_inode	*ri,
1073 	struct xfs_dinode	*dip,
1074 	uint16_t		mode)
1075 {
1076 	struct xfs_scrub	*sc = ri->sc;
1077 
1078 	trace_xrep_dinode_zap_afork(sc, dip);
1079 
1080 	ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
1081 
1082 	dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
1083 	xrep_dinode_set_attr_nextents(dip, 0);
1084 	ri->attr_blocks = 0;
1085 
1086 	/*
1087 	 * If the data fork is in btree format, removing the attr fork entirely
1088 	 * might cause verifier failures if the next level down in the bmbt
1089 	 * could now fit in the data fork area.
1090 	 */
1091 	if (dip->di_format != XFS_DINODE_FMT_BTREE)
1092 		dip->di_forkoff = 0;
1093 	dip->di_mode = cpu_to_be16(mode & ~0777);
1094 	dip->di_uid = 0;
1095 	dip->di_gid = 0;
1096 }
1097 
1098 /* Make sure the fork offset is a sensible value. */
1099 STATIC void
1100 xrep_dinode_ensure_forkoff(
1101 	struct xrep_inode	*ri,
1102 	struct xfs_dinode	*dip,
1103 	uint16_t		mode)
1104 {
1105 	struct xfs_bmdr_block	*bmdr;
1106 	struct xfs_scrub	*sc = ri->sc;
1107 	xfs_extnum_t		attr_extents, data_extents;
1108 	size_t			bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
1109 	unsigned int		lit_sz = XFS_LITINO(sc->mp);
1110 	unsigned int		afork_min, dfork_min;
1111 
1112 	trace_xrep_dinode_ensure_forkoff(sc, dip);
1113 
1114 	/*
1115 	 * Before calling this function, xrep_dinode_core ensured that both
1116 	 * forks actually fit inside their respective literal areas.  If this
1117 	 * was not the case, the fork was reset to FMT_EXTENTS with zero
1118 	 * records.  If the rmapbt scan found attr or data fork blocks, this
1119 	 * will be noted in the dinode_stats, and we must leave enough room
1120 	 * for the bmap repair code to reconstruct the mapping structure.
1121 	 *
1122 	 * First, compute the minimum space required for the attr fork.
1123 	 */
1124 	switch (dip->di_aformat) {
1125 	case XFS_DINODE_FMT_LOCAL:
1126 		/*
1127 		 * If we still have a shortform xattr structure at all, that
1128 		 * means the attr fork area was exactly large enough to fit
1129 		 * the sf structure.
1130 		 */
1131 		afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
1132 		break;
1133 	case XFS_DINODE_FMT_EXTENTS:
1134 		attr_extents = xfs_dfork_attr_extents(dip);
1135 		if (attr_extents) {
1136 			/*
1137 			 * We must maintain sufficient space to hold the entire
1138 			 * extent map array in the data fork.  Note that we
1139 			 * previously zapped the fork if it had no chance of
1140 			 * fitting in the inode.
1141 			 */
1142 			afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
1143 		} else if (ri->attr_extents > 0) {
1144 			/*
1145 			 * The attr fork thinks it has zero extents, but we
1146 			 * found some xattr extents.  We need to leave enough
1147 			 * empty space here so that the incore attr fork will
1148 			 * get created (and hence trigger the attr fork bmap
1149 			 * repairer).
1150 			 */
1151 			afork_min = bmdr_minsz;
1152 		} else {
1153 			/* No extents on disk or found in rmapbt. */
1154 			afork_min = 0;
1155 		}
1156 		break;
1157 	case XFS_DINODE_FMT_BTREE:
1158 		/* Must have space for btree header and key/pointers. */
1159 		bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
1160 		afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1161 		break;
1162 	default:
1163 		/* We should never see any other formats. */
1164 		afork_min = 0;
1165 		break;
1166 	}
1167 
1168 	/* Compute the minimum space required for the data fork. */
1169 	switch (dip->di_format) {
1170 	case XFS_DINODE_FMT_DEV:
1171 		dfork_min = sizeof(__be32);
1172 		break;
1173 	case XFS_DINODE_FMT_UUID:
1174 		dfork_min = sizeof(uuid_t);
1175 		break;
1176 	case XFS_DINODE_FMT_LOCAL:
1177 		/*
1178 		 * If we still have a shortform data fork at all, that means
1179 		 * the data fork area was large enough to fit whatever was in
1180 		 * there.
1181 		 */
1182 		dfork_min = be64_to_cpu(dip->di_size);
1183 		break;
1184 	case XFS_DINODE_FMT_EXTENTS:
1185 		data_extents = xfs_dfork_data_extents(dip);
1186 		if (data_extents) {
1187 			/*
1188 			 * We must maintain sufficient space to hold the entire
1189 			 * extent map array in the data fork.  Note that we
1190 			 * previously zapped the fork if it had no chance of
1191 			 * fitting in the inode.
1192 			 */
1193 			dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
1194 		} else if (ri->data_extents > 0 || ri->rt_extents > 0) {
1195 			/*
1196 			 * The data fork thinks it has zero extents, but we
1197 			 * found some data extents.  We need to leave enough
1198 			 * empty space here so that the data fork bmap repair
1199 			 * will recover the mappings.
1200 			 */
1201 			dfork_min = bmdr_minsz;
1202 		} else {
1203 			/* No extents on disk or found in rmapbt. */
1204 			dfork_min = 0;
1205 		}
1206 		break;
1207 	case XFS_DINODE_FMT_BTREE:
1208 		/* Must have space for btree header and key/pointers. */
1209 		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
1210 		dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
1211 		break;
1212 	default:
1213 		dfork_min = 0;
1214 		break;
1215 	}
1216 
1217 	/*
1218 	 * Round all values up to the nearest 8 bytes, because that is the
1219 	 * precision of di_forkoff.
1220 	 */
1221 	afork_min = roundup(afork_min, 8);
1222 	dfork_min = roundup(dfork_min, 8);
1223 	bmdr_minsz = roundup(bmdr_minsz, 8);
1224 
1225 	ASSERT(dfork_min <= lit_sz);
1226 	ASSERT(afork_min <= lit_sz);
1227 
1228 	/*
1229 	 * If the data fork was zapped and we don't have enough space for the
1230 	 * recovery fork, move the attr fork up.
1231 	 */
1232 	if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
1233 	    xfs_dfork_data_extents(dip) == 0 &&
1234 	    (ri->data_extents > 0 || ri->rt_extents > 0) &&
1235 	    bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
1236 		if (bmdr_minsz + afork_min > lit_sz) {
1237 			/*
1238 			 * The attr for and the stub fork we need to recover
1239 			 * the data fork won't both fit.  Zap the attr fork.
1240 			 */
1241 			xrep_dinode_zap_afork(ri, dip, mode);
1242 			afork_min = bmdr_minsz;
1243 		} else {
1244 			void	*before, *after;
1245 
1246 			/* Otherwise, just slide the attr fork up. */
1247 			before = XFS_DFORK_APTR(dip);
1248 			dip->di_forkoff = bmdr_minsz >> 3;
1249 			after = XFS_DFORK_APTR(dip);
1250 			memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
1251 		}
1252 	}
1253 
1254 	/*
1255 	 * If the attr fork was zapped and we don't have enough space for the
1256 	 * recovery fork, move the attr fork down.
1257 	 */
1258 	if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
1259 	    xfs_dfork_attr_extents(dip) == 0 &&
1260 	    ri->attr_extents > 0 &&
1261 	    bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
1262 		if (dip->di_format == XFS_DINODE_FMT_BTREE) {
1263 			/*
1264 			 * If the data fork is in btree format then we can't
1265 			 * adjust forkoff because that runs the risk of
1266 			 * violating the extents/btree format transition rules.
1267 			 */
1268 		} else if (bmdr_minsz + dfork_min > lit_sz) {
1269 			/*
1270 			 * If we can't move the attr fork, too bad, we lose the
1271 			 * attr fork and leak its blocks.
1272 			 */
1273 			xrep_dinode_zap_afork(ri, dip, mode);
1274 		} else {
1275 			/*
1276 			 * Otherwise, just slide the attr fork down.  The attr
1277 			 * fork is empty, so we don't have any old contents to
1278 			 * move here.
1279 			 */
1280 			dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
1281 		}
1282 	}
1283 }
1284 
1285 /*
1286  * Zap the data/attr forks if we spot anything that isn't going to pass the
1287  * ifork verifiers or the ifork formatters, because we need to get the inode
1288  * into good enough shape that the higher level repair functions can run.
1289  */
1290 STATIC void
1291 xrep_dinode_zap_forks(
1292 	struct xrep_inode	*ri,
1293 	struct xfs_dinode	*dip)
1294 {
1295 	struct xfs_scrub	*sc = ri->sc;
1296 	xfs_extnum_t		data_extents;
1297 	xfs_extnum_t		attr_extents;
1298 	xfs_filblks_t		nblocks;
1299 	uint16_t		mode;
1300 	bool			zap_datafork = false;
1301 	bool			zap_attrfork = ri->zap_acls;
1302 
1303 	trace_xrep_dinode_zap_forks(sc, dip);
1304 
1305 	mode = be16_to_cpu(dip->di_mode);
1306 
1307 	data_extents = xfs_dfork_data_extents(dip);
1308 	attr_extents = xfs_dfork_attr_extents(dip);
1309 	nblocks = be64_to_cpu(dip->di_nblocks);
1310 
1311 	/* Inode counters don't make sense? */
1312 	if (data_extents > nblocks)
1313 		zap_datafork = true;
1314 	if (attr_extents > nblocks)
1315 		zap_attrfork = true;
1316 	if (data_extents + attr_extents > nblocks)
1317 		zap_datafork = zap_attrfork = true;
1318 
1319 	if (!zap_datafork)
1320 		zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
1321 	if (!zap_attrfork)
1322 		zap_attrfork = xrep_dinode_check_afork(sc, dip);
1323 
1324 	/* Zap whatever's bad. */
1325 	if (zap_attrfork)
1326 		xrep_dinode_zap_afork(ri, dip, mode);
1327 	if (zap_datafork)
1328 		xrep_dinode_zap_dfork(ri, dip, mode);
1329 	xrep_dinode_ensure_forkoff(ri, dip, mode);
1330 
1331 	/*
1332 	 * Zero di_nblocks if we don't have any extents at all to satisfy the
1333 	 * buffer verifier.
1334 	 */
1335 	data_extents = xfs_dfork_data_extents(dip);
1336 	attr_extents = xfs_dfork_attr_extents(dip);
1337 	if (data_extents + attr_extents == 0)
1338 		dip->di_nblocks = 0;
1339 }
1340 
1341 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
1342 STATIC int
1343 xrep_dinode_core(
1344 	struct xrep_inode	*ri)
1345 {
1346 	struct xfs_scrub	*sc = ri->sc;
1347 	struct xfs_buf		*bp;
1348 	struct xfs_dinode	*dip;
1349 	xfs_ino_t		ino = sc->sm->sm_ino;
1350 	int			error;
1351 	int			iget_error;
1352 
1353 	/* Figure out what this inode had mapped in both forks. */
1354 	error = xrep_dinode_count_rmaps(ri);
1355 	if (error)
1356 		return error;
1357 
1358 	/* Read the inode cluster buffer. */
1359 	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
1360 			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
1361 			NULL);
1362 	if (error)
1363 		return error;
1364 
1365 	/* Make sure we can pass the inode buffer verifier. */
1366 	xrep_dinode_buf(sc, bp);
1367 	bp->b_ops = &xfs_inode_buf_ops;
1368 
1369 	/* Fix everything the verifier will complain about. */
1370 	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
1371 	xrep_dinode_header(sc, dip);
1372 	iget_error = xrep_dinode_mode(ri, dip);
1373 	if (iget_error)
1374 		goto write;
1375 	xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
1376 	xrep_dinode_size(ri, dip);
1377 	xrep_dinode_extsize_hints(sc, dip);
1378 	xrep_dinode_zap_forks(ri, dip);
1379 
1380 write:
1381 	/* Write out the inode. */
1382 	trace_xrep_dinode_fixed(sc, dip);
1383 	xfs_dinode_calc_crc(sc->mp, dip);
1384 	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
1385 	xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
1386 			ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
1387 
1388 	/*
1389 	 * In theory, we've fixed the ondisk inode record enough that we should
1390 	 * be able to load the inode into the cache.  Try to iget that inode
1391 	 * now while we hold the AGI and the inode cluster buffer and take the
1392 	 * IOLOCK so that we can continue with repairs without anyone else
1393 	 * accessing the inode.  If iget fails, we still need to commit the
1394 	 * changes.
1395 	 */
1396 	if (!iget_error)
1397 		iget_error = xchk_iget(sc, ino, &sc->ip);
1398 	if (!iget_error)
1399 		xchk_ilock(sc, XFS_IOLOCK_EXCL);
1400 
1401 	/*
1402 	 * Commit the inode cluster buffer updates and drop the AGI buffer that
1403 	 * we've been holding since scrub setup.  From here on out, repairs
1404 	 * deal only with the cached inode.
1405 	 */
1406 	error = xrep_trans_commit(sc);
1407 	if (error)
1408 		return error;
1409 
1410 	if (iget_error)
1411 		return iget_error;
1412 
1413 	error = xchk_trans_alloc(sc, 0);
1414 	if (error)
1415 		return error;
1416 
1417 	error = xrep_ino_dqattach(sc);
1418 	if (error)
1419 		return error;
1420 
1421 	xchk_ilock(sc, XFS_ILOCK_EXCL);
1422 	if (ri->ino_sick_mask)
1423 		xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
1424 	return 0;
1425 }
1426 
1427 /* Fix everything xfs_dinode_verify cares about. */
1428 STATIC int
1429 xrep_dinode_problems(
1430 	struct xrep_inode	*ri)
1431 {
1432 	struct xfs_scrub	*sc = ri->sc;
1433 	int			error;
1434 
1435 	error = xrep_dinode_core(ri);
1436 	if (error)
1437 		return error;
1438 
1439 	/* We had to fix a totally busted inode, schedule quotacheck. */
1440 	if (XFS_IS_UQUOTA_ON(sc->mp))
1441 		xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1442 	if (XFS_IS_GQUOTA_ON(sc->mp))
1443 		xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1444 	if (XFS_IS_PQUOTA_ON(sc->mp))
1445 		xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1446 
1447 	return 0;
1448 }
1449 
1450 /*
1451  * Fix problems that the verifiers don't care about.  In general these are
1452  * errors that don't cause problems elsewhere in the kernel that we can easily
1453  * detect, so we don't check them all that rigorously.
1454  */
1455 
1456 /* Make sure block and extent counts are ok. */
1457 STATIC int
1458 xrep_inode_blockcounts(
1459 	struct xfs_scrub	*sc)
1460 {
1461 	struct xfs_ifork	*ifp;
1462 	xfs_filblks_t		count;
1463 	xfs_filblks_t		acount;
1464 	xfs_extnum_t		nextents;
1465 	int			error;
1466 
1467 	trace_xrep_inode_blockcounts(sc);
1468 
1469 	/* Set data fork counters from the data fork mappings. */
1470 	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
1471 			&nextents, &count);
1472 	if (error)
1473 		return error;
1474 	if (xfs_is_reflink_inode(sc->ip)) {
1475 		/*
1476 		 * data fork blockcount can exceed physical storage if a user
1477 		 * reflinks the same block over and over again.
1478 		 */
1479 		;
1480 	} else if (XFS_IS_REALTIME_INODE(sc->ip)) {
1481 		if (count >= sc->mp->m_sb.sb_rblocks)
1482 			return -EFSCORRUPTED;
1483 	} else {
1484 		if (count >= sc->mp->m_sb.sb_dblocks)
1485 			return -EFSCORRUPTED;
1486 	}
1487 	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
1488 	if (error)
1489 		return error;
1490 	sc->ip->i_df.if_nextents = nextents;
1491 
1492 	/* Set attr fork counters from the attr fork mappings. */
1493 	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
1494 	if (ifp) {
1495 		error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
1496 				&nextents, &acount);
1497 		if (error)
1498 			return error;
1499 		if (count >= sc->mp->m_sb.sb_dblocks)
1500 			return -EFSCORRUPTED;
1501 		error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
1502 				nextents);
1503 		if (error)
1504 			return error;
1505 		ifp->if_nextents = nextents;
1506 	} else {
1507 		acount = 0;
1508 	}
1509 
1510 	sc->ip->i_nblocks = count + acount;
1511 	return 0;
1512 }
1513 
1514 /* Check for invalid uid/gid/prid. */
1515 STATIC void
1516 xrep_inode_ids(
1517 	struct xfs_scrub	*sc)
1518 {
1519 	bool			dirty = false;
1520 
1521 	trace_xrep_inode_ids(sc);
1522 
1523 	if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
1524 		i_uid_write(VFS_I(sc->ip), 0);
1525 		dirty = true;
1526 		if (XFS_IS_UQUOTA_ON(sc->mp))
1527 			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1528 	}
1529 
1530 	if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
1531 		i_gid_write(VFS_I(sc->ip), 0);
1532 		dirty = true;
1533 		if (XFS_IS_GQUOTA_ON(sc->mp))
1534 			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1535 	}
1536 
1537 	if (sc->ip->i_projid == -1U) {
1538 		sc->ip->i_projid = 0;
1539 		dirty = true;
1540 		if (XFS_IS_PQUOTA_ON(sc->mp))
1541 			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1542 	}
1543 
1544 	/* strip setuid/setgid if we touched any of the ids */
1545 	if (dirty)
1546 		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
1547 }
1548 
1549 static inline void
1550 xrep_clamp_timestamp(
1551 	struct xfs_inode	*ip,
1552 	struct timespec64	*ts)
1553 {
1554 	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
1555 	*ts = timestamp_truncate(*ts, VFS_I(ip));
1556 }
1557 
1558 /* Nanosecond counters can't have more than 1 billion. */
1559 STATIC void
1560 xrep_inode_timestamps(
1561 	struct xfs_inode	*ip)
1562 {
1563 	struct timespec64	tstamp;
1564 	struct inode		*inode = VFS_I(ip);
1565 
1566 	tstamp = inode_get_atime(inode);
1567 	xrep_clamp_timestamp(ip, &tstamp);
1568 	inode_set_atime_to_ts(inode, tstamp);
1569 
1570 	tstamp = inode_get_mtime(inode);
1571 	xrep_clamp_timestamp(ip, &tstamp);
1572 	inode_set_mtime_to_ts(inode, tstamp);
1573 
1574 	tstamp = inode_get_ctime(inode);
1575 	xrep_clamp_timestamp(ip, &tstamp);
1576 	inode_set_ctime_to_ts(inode, tstamp);
1577 
1578 	xrep_clamp_timestamp(ip, &ip->i_crtime);
1579 }
1580 
1581 /* Fix inode flags that don't make sense together. */
1582 STATIC void
1583 xrep_inode_flags(
1584 	struct xfs_scrub	*sc)
1585 {
1586 	uint16_t		mode;
1587 
1588 	trace_xrep_inode_flags(sc);
1589 
1590 	mode = VFS_I(sc->ip)->i_mode;
1591 
1592 	/* Clear junk flags */
1593 	if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
1594 		sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
1595 
1596 	/* NEWRTBM only applies to realtime bitmaps */
1597 	if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
1598 		sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
1599 	else
1600 		sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
1601 
1602 	/* These only make sense for directories. */
1603 	if (!S_ISDIR(mode))
1604 		sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
1605 					  XFS_DIFLAG_EXTSZINHERIT |
1606 					  XFS_DIFLAG_PROJINHERIT |
1607 					  XFS_DIFLAG_NOSYMLINKS);
1608 
1609 	/* These only make sense for files. */
1610 	if (!S_ISREG(mode))
1611 		sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
1612 					  XFS_DIFLAG_EXTSIZE);
1613 
1614 	/* These only make sense for non-rt files. */
1615 	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1616 		sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
1617 
1618 	/* Immutable and append only?  Drop the append. */
1619 	if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
1620 	    (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
1621 		sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
1622 
1623 	/* Clear junk flags. */
1624 	if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
1625 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
1626 
1627 	/* No reflink flag unless we support it and it's a file. */
1628 	if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
1629 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1630 
1631 	/* DAX only applies to files and dirs. */
1632 	if (!(S_ISREG(mode) || S_ISDIR(mode)))
1633 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
1634 
1635 	/* No reflink files on the realtime device. */
1636 	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1637 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1638 }
1639 
1640 /*
1641  * Fix size problems with block/node format directories.  If we fail to find
1642  * the extent list, just bail out and let the bmapbtd repair functions clean
1643  * up that mess.
1644  */
1645 STATIC void
1646 xrep_inode_blockdir_size(
1647 	struct xfs_scrub	*sc)
1648 {
1649 	struct xfs_iext_cursor	icur;
1650 	struct xfs_bmbt_irec	got;
1651 	struct xfs_ifork	*ifp;
1652 	xfs_fileoff_t		off;
1653 	int			error;
1654 
1655 	trace_xrep_inode_blockdir_size(sc);
1656 
1657 	error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
1658 	if (error)
1659 		return;
1660 
1661 	/* Find the last block before 32G; this is the dir size. */
1662 	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1663 	off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
1664 	if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
1665 		/* zero-extents directory? */
1666 		return;
1667 	}
1668 
1669 	off = got.br_startoff + got.br_blockcount;
1670 	sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
1671 			XFS_FSB_TO_B(sc->mp, off));
1672 }
1673 
1674 /* Fix size problems with short format directories. */
1675 STATIC void
1676 xrep_inode_sfdir_size(
1677 	struct xfs_scrub	*sc)
1678 {
1679 	struct xfs_ifork	*ifp;
1680 
1681 	trace_xrep_inode_sfdir_size(sc);
1682 
1683 	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1684 	sc->ip->i_disk_size = ifp->if_bytes;
1685 }
1686 
1687 /*
1688  * Fix any irregularities in a directory inode's size now that we can iterate
1689  * extent maps and access other regular inode data.
1690  */
1691 STATIC void
1692 xrep_inode_dir_size(
1693 	struct xfs_scrub	*sc)
1694 {
1695 	trace_xrep_inode_dir_size(sc);
1696 
1697 	switch (sc->ip->i_df.if_format) {
1698 	case XFS_DINODE_FMT_EXTENTS:
1699 	case XFS_DINODE_FMT_BTREE:
1700 		xrep_inode_blockdir_size(sc);
1701 		break;
1702 	case XFS_DINODE_FMT_LOCAL:
1703 		xrep_inode_sfdir_size(sc);
1704 		break;
1705 	}
1706 }
1707 
1708 /* Fix extent size hint problems. */
1709 STATIC void
1710 xrep_inode_extsize(
1711 	struct xfs_scrub	*sc)
1712 {
1713 	/* Fix misaligned extent size hints on a directory. */
1714 	if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
1715 	    (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
1716 	    xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
1717 		sc->ip->i_extsize = 0;
1718 		sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
1719 	}
1720 }
1721 
1722 /* Fix any irregularities in an inode that the verifiers don't catch. */
1723 STATIC int
1724 xrep_inode_problems(
1725 	struct xfs_scrub	*sc)
1726 {
1727 	int			error;
1728 
1729 	error = xrep_inode_blockcounts(sc);
1730 	if (error)
1731 		return error;
1732 	xrep_inode_timestamps(sc->ip);
1733 	xrep_inode_flags(sc);
1734 	xrep_inode_ids(sc);
1735 	/*
1736 	 * We can now do a better job fixing the size of a directory now that
1737 	 * we can scan the data fork extents than we could in xrep_dinode_size.
1738 	 */
1739 	if (S_ISDIR(VFS_I(sc->ip)->i_mode))
1740 		xrep_inode_dir_size(sc);
1741 	xrep_inode_extsize(sc);
1742 
1743 	trace_xrep_inode_fixed(sc);
1744 	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1745 	return xrep_roll_trans(sc);
1746 }
1747 
1748 /* Repair an inode's fields. */
1749 int
1750 xrep_inode(
1751 	struct xfs_scrub	*sc)
1752 {
1753 	int			error = 0;
1754 
1755 	/*
1756 	 * No inode?  That means we failed the _iget verifiers.  Repair all
1757 	 * the things that the inode verifiers care about, then retry _iget.
1758 	 */
1759 	if (!sc->ip) {
1760 		struct xrep_inode	*ri = sc->buf;
1761 
1762 		ASSERT(ri != NULL);
1763 
1764 		error = xrep_dinode_problems(ri);
1765 		if (error == -EBUSY) {
1766 			/*
1767 			 * Directory scan to recover inode mode encountered a
1768 			 * busy inode, so we did not continue repairing things.
1769 			 */
1770 			return 0;
1771 		}
1772 		if (error)
1773 			return error;
1774 
1775 		/* By this point we had better have a working incore inode. */
1776 		if (!sc->ip)
1777 			return -EFSCORRUPTED;
1778 	}
1779 
1780 	xfs_trans_ijoin(sc->tp, sc->ip, 0);
1781 
1782 	/* If we found corruption of any kind, try to fix it. */
1783 	if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
1784 	    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
1785 		error = xrep_inode_problems(sc);
1786 		if (error)
1787 			return error;
1788 	}
1789 
1790 	/* See if we can clear the reflink flag. */
1791 	if (xfs_is_reflink_inode(sc->ip)) {
1792 		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1793 		if (error)
1794 			return error;
1795 	}
1796 
1797 	return xrep_defer_finish(sc);
1798 }
1799