xref: /linux/fs/xfs/scrub/inode_repair.c (revision a095686a2383526d7315197e2419d84ee8470217)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * Copyright (C) 2018-2023 Oracle.  All Rights Reserved.
4  * Author: Darrick J. Wong <djwong@kernel.org>
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_trans_resv.h"
11 #include "xfs_mount.h"
12 #include "xfs_defer.h"
13 #include "xfs_btree.h"
14 #include "xfs_bit.h"
15 #include "xfs_log_format.h"
16 #include "xfs_trans.h"
17 #include "xfs_sb.h"
18 #include "xfs_inode.h"
19 #include "xfs_icache.h"
20 #include "xfs_inode_buf.h"
21 #include "xfs_inode_fork.h"
22 #include "xfs_ialloc.h"
23 #include "xfs_da_format.h"
24 #include "xfs_reflink.h"
25 #include "xfs_alloc.h"
26 #include "xfs_rmap.h"
27 #include "xfs_rmap_btree.h"
28 #include "xfs_bmap.h"
29 #include "xfs_bmap_btree.h"
30 #include "xfs_bmap_util.h"
31 #include "xfs_dir2.h"
32 #include "xfs_dir2_priv.h"
33 #include "xfs_quota_defs.h"
34 #include "xfs_quota.h"
35 #include "xfs_ag.h"
36 #include "xfs_rtbitmap.h"
37 #include "xfs_attr_leaf.h"
38 #include "xfs_log_priv.h"
39 #include "xfs_health.h"
40 #include "scrub/xfs_scrub.h"
41 #include "scrub/scrub.h"
42 #include "scrub/common.h"
43 #include "scrub/btree.h"
44 #include "scrub/trace.h"
45 #include "scrub/repair.h"
46 #include "scrub/iscan.h"
47 #include "scrub/readdir.h"
48 
49 /*
50  * Inode Record Repair
51  * ===================
52  *
53  * Roughly speaking, inode problems can be classified based on whether or not
54  * they trip the dinode verifiers.  If those trip, then we won't be able to
55  * xfs_iget ourselves the inode.
56  *
 * Therefore, the xrep_dinode_* functions fix anything that will cause the
 * inode buffer verifier or the dinode verifier to fail.  The xrep_inode_*
 * functions fix things on live incore inodes.  The inode repair functions
 * make decisions with security and usability implications when reviving a
 * file:
61  *
 * - Files with zero di_mode or a garbage di_mode are converted to a regular
 *   file that only root can read.  This file may not actually contain user
 *   data, if the file was not previously a regular file.  Setuid and setgid
 *   bits are cleared.
66  *
67  * - Zero-size directories can be truncated to look empty.  It is necessary to
68  *   run the bmapbtd and directory repair functions to fully rebuild the
69  *   directory.
70  *
71  * - Zero-size symbolic link targets can be truncated to '?'.  It is necessary
72  *   to run the bmapbtd and symlink repair functions to salvage the symlink.
73  *
74  * - Invalid extent size hints will be removed.
75  *
76  * - Quotacheck will be scheduled if we repaired an inode that was so badly
77  *   damaged that the ondisk inode had to be rebuilt.
78  *
79  * - Invalid user, group, or project IDs (aka -1U) will be reset to zero.
80  *   Setuid and setgid bits are cleared.
81  *
82  * - Data and attr forks are reset to extents format with zero extents if the
83  *   fork data is inconsistent.  It is necessary to run the bmapbtd or bmapbta
84  *   repair functions to recover the space mapping.
85  *
86  * - ACLs will not be recovered if the attr fork is zapped or the extended
87  *   attribute structure itself requires salvaging.
88  *
89  * - If the attr fork is zapped, the user and group ids are reset to root and
90  *   the setuid and setgid bits are removed.
91  */
92 
93 /*
94  * All the information we need to repair the ondisk inode if we can't iget the
95  * incore inode.  We don't allocate this buffer unless we're going to perform
96  * a repair to the ondisk inode cluster buffer.
97  */
98 struct xrep_inode {
99 	/* Inode mapping that we saved from the initial lookup attempt. */
100 	struct xfs_imap		imap;
101 
102 	struct xfs_scrub	*sc;
103 
104 	/* Blocks in use on the data device by data extents or bmbt blocks. */
105 	xfs_rfsblock_t		data_blocks;
106 
107 	/* Blocks in use on the rt device. */
108 	xfs_rfsblock_t		rt_blocks;
109 
110 	/* Blocks in use by the attr fork. */
111 	xfs_rfsblock_t		attr_blocks;
112 
113 	/* Number of data device extents for the data fork. */
114 	xfs_extnum_t		data_extents;
115 
116 	/*
117 	 * Number of realtime device extents for the data fork.  If
118 	 * data_extents and rt_extents indicate that the data fork has extents
119 	 * on both devices, we'll just back away slowly.
120 	 */
121 	xfs_extnum_t		rt_extents;
122 
123 	/* Number of (data device) extents for the attr fork. */
124 	xfs_aextnum_t		attr_extents;
125 
126 	/* Sick state to set after zapping parts of the inode. */
127 	unsigned int		ino_sick_mask;
128 
129 	/* Must we remove all access from this file? */
130 	bool			zap_acls;
131 
132 	/* Inode scanner to see if we can find the ftype from dirents */
133 	struct xchk_iscan	ftype_iscan;
134 	uint8_t			alleged_ftype;
135 };
136 
137 /*
138  * Setup function for inode repair.  @imap contains the ondisk inode mapping
139  * information so that we can correct the ondisk inode cluster buffer if
140  * necessary to make iget work.
141  */
142 int
143 xrep_setup_inode(
144 	struct xfs_scrub	*sc,
145 	const struct xfs_imap	*imap)
146 {
147 	struct xrep_inode	*ri;
148 
149 	sc->buf = kzalloc(sizeof(struct xrep_inode), XCHK_GFP_FLAGS);
150 	if (!sc->buf)
151 		return -ENOMEM;
152 
153 	ri = sc->buf;
154 	memcpy(&ri->imap, imap, sizeof(struct xfs_imap));
155 	ri->sc = sc;
156 	return 0;
157 }
158 
159 /*
160  * Make sure this ondisk inode can pass the inode buffer verifier.  This is
161  * not the same as the dinode verifier.
162  */
163 STATIC void
164 xrep_dinode_buf_core(
165 	struct xfs_scrub	*sc,
166 	struct xfs_buf		*bp,
167 	unsigned int		ioffset)
168 {
169 	struct xfs_dinode	*dip = xfs_buf_offset(bp, ioffset);
170 	struct xfs_trans	*tp = sc->tp;
171 	struct xfs_mount	*mp = sc->mp;
172 	xfs_agino_t		agino;
173 	bool			crc_ok = false;
174 	bool			magic_ok = false;
175 	bool			unlinked_ok = false;
176 
177 	agino = be32_to_cpu(dip->di_next_unlinked);
178 
179 	if (xfs_verify_agino_or_null(bp->b_pag, agino))
180 		unlinked_ok = true;
181 
182 	if (dip->di_magic == cpu_to_be16(XFS_DINODE_MAGIC) &&
183 	    xfs_dinode_good_version(mp, dip->di_version))
184 		magic_ok = true;
185 
186 	if (xfs_verify_cksum((char *)dip, mp->m_sb.sb_inodesize,
187 			XFS_DINODE_CRC_OFF))
188 		crc_ok = true;
189 
190 	if (magic_ok && unlinked_ok && crc_ok)
191 		return;
192 
193 	if (!magic_ok) {
194 		dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
195 		dip->di_version = 3;
196 	}
197 	if (!unlinked_ok)
198 		dip->di_next_unlinked = cpu_to_be32(NULLAGINO);
199 	xfs_dinode_calc_crc(mp, dip);
200 	xfs_trans_buf_set_type(tp, bp, XFS_BLFT_DINO_BUF);
201 	xfs_trans_log_buf(tp, bp, ioffset,
202 				  ioffset + sizeof(struct xfs_dinode) - 1);
203 }
204 
205 /* Make sure this inode cluster buffer can pass the inode buffer verifier. */
206 STATIC void
207 xrep_dinode_buf(
208 	struct xfs_scrub	*sc,
209 	struct xfs_buf		*bp)
210 {
211 	struct xfs_mount	*mp = sc->mp;
212 	int			i;
213 	int			ni;
214 
215 	ni = XFS_BB_TO_FSB(mp, bp->b_length) * mp->m_sb.sb_inopblock;
216 	for (i = 0; i < ni; i++)
217 		xrep_dinode_buf_core(sc, bp, i << mp->m_sb.sb_inodelog);
218 }
219 
220 /* Reinitialize things that never change in an inode. */
221 STATIC void
222 xrep_dinode_header(
223 	struct xfs_scrub	*sc,
224 	struct xfs_dinode	*dip)
225 {
226 	trace_xrep_dinode_header(sc, dip);
227 
228 	dip->di_magic = cpu_to_be16(XFS_DINODE_MAGIC);
229 	if (!xfs_dinode_good_version(sc->mp, dip->di_version))
230 		dip->di_version = 3;
231 	dip->di_ino = cpu_to_be64(sc->sm->sm_ino);
232 	uuid_copy(&dip->di_uuid, &sc->mp->m_sb.sb_meta_uuid);
233 	dip->di_gen = cpu_to_be32(sc->sm->sm_gen);
234 }
235 
236 /*
237  * If this directory entry points to the scrub target inode, then the directory
238  * we're scanning is the parent of the scrub target inode.
239  */
240 STATIC int
241 xrep_dinode_findmode_dirent(
242 	struct xfs_scrub		*sc,
243 	struct xfs_inode		*dp,
244 	xfs_dir2_dataptr_t		dapos,
245 	const struct xfs_name		*name,
246 	xfs_ino_t			ino,
247 	void				*priv)
248 {
249 	struct xrep_inode		*ri = priv;
250 	int				error = 0;
251 
252 	if (xchk_should_terminate(ri->sc, &error))
253 		return error;
254 
255 	if (ino != sc->sm->sm_ino)
256 		return 0;
257 
258 	/* Ignore garbage directory entry names. */
259 	if (name->len == 0 || !xfs_dir2_namecheck(name->name, name->len))
260 		return -EFSCORRUPTED;
261 
262 	/* Don't pick up dot or dotdot entries; we only want child dirents. */
263 	if (xfs_dir2_samename(name, &xfs_name_dotdot) ||
264 	    xfs_dir2_samename(name, &xfs_name_dot))
265 		return 0;
266 
267 	/*
268 	 * Uhoh, more than one parent for this inode and they don't agree on
269 	 * the file type?
270 	 */
271 	if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN &&
272 	    ri->alleged_ftype != name->type) {
273 		trace_xrep_dinode_findmode_dirent_inval(ri->sc, dp, name->type,
274 				ri->alleged_ftype);
275 		return -EFSCORRUPTED;
276 	}
277 
278 	/* We found a potential parent; remember the ftype. */
279 	trace_xrep_dinode_findmode_dirent(ri->sc, dp, name->type);
280 	ri->alleged_ftype = name->type;
281 	return 0;
282 }
283 
284 /*
285  * If this is a directory, walk the dirents looking for any that point to the
286  * scrub target inode.
287  */
288 STATIC int
289 xrep_dinode_findmode_walk_directory(
290 	struct xrep_inode	*ri,
291 	struct xfs_inode	*dp)
292 {
293 	struct xfs_scrub	*sc = ri->sc;
294 	unsigned int		lock_mode;
295 	int			error = 0;
296 
297 	/*
298 	 * Scan the directory to see if there it contains an entry pointing to
299 	 * the directory that we are repairing.
300 	 */
301 	lock_mode = xfs_ilock_data_map_shared(dp);
302 
303 	/*
304 	 * If this directory is known to be sick, we cannot scan it reliably
305 	 * and must abort.
306 	 */
307 	if (xfs_inode_has_sickness(dp, XFS_SICK_INO_CORE |
308 				       XFS_SICK_INO_BMBTD |
309 				       XFS_SICK_INO_DIR)) {
310 		error = -EFSCORRUPTED;
311 		goto out_unlock;
312 	}
313 
314 	/*
315 	 * We cannot complete our parent pointer scan if a directory looks as
316 	 * though it has been zapped by the inode record repair code.
317 	 */
318 	if (xchk_dir_looks_zapped(dp)) {
319 		error = -EBUSY;
320 		goto out_unlock;
321 	}
322 
323 	error = xchk_dir_walk(sc, dp, xrep_dinode_findmode_dirent, ri);
324 	if (error)
325 		goto out_unlock;
326 
327 out_unlock:
328 	xfs_iunlock(dp, lock_mode);
329 	return error;
330 }
331 
332 /*
333  * Try to find the mode of the inode being repaired by looking for directories
334  * that point down to this file.
335  */
336 STATIC int
337 xrep_dinode_find_mode(
338 	struct xrep_inode	*ri,
339 	uint16_t		*mode)
340 {
341 	struct xfs_scrub	*sc = ri->sc;
342 	struct xfs_inode	*dp;
343 	int			error;
344 
345 	/* No ftype means we have no other metadata to consult. */
346 	if (!xfs_has_ftype(sc->mp)) {
347 		*mode = S_IFREG;
348 		return 0;
349 	}
350 
351 	/*
352 	 * Scan all directories for parents that might point down to this
353 	 * inode.  Skip the inode being repaired during the scan since it
354 	 * cannot be its own parent.  Note that we still hold the AGI locked
355 	 * so there's a real possibility that _iscan_iter can return EBUSY.
356 	 */
357 	xchk_iscan_start(sc, 5000, 100, &ri->ftype_iscan);
358 	ri->ftype_iscan.skip_ino = sc->sm->sm_ino;
359 	ri->alleged_ftype = XFS_DIR3_FT_UNKNOWN;
360 	while ((error = xchk_iscan_iter(&ri->ftype_iscan, &dp)) == 1) {
361 		if (S_ISDIR(VFS_I(dp)->i_mode))
362 			error = xrep_dinode_findmode_walk_directory(ri, dp);
363 		xchk_iscan_mark_visited(&ri->ftype_iscan, dp);
364 		xchk_irele(sc, dp);
365 		if (error < 0)
366 			break;
367 		if (xchk_should_terminate(sc, &error))
368 			break;
369 	}
370 	xchk_iscan_iter_finish(&ri->ftype_iscan);
371 	xchk_iscan_teardown(&ri->ftype_iscan);
372 
373 	if (error == -EBUSY) {
374 		if (ri->alleged_ftype != XFS_DIR3_FT_UNKNOWN) {
375 			/*
376 			 * If we got an EBUSY after finding at least one
377 			 * dirent, that means the scan found an inode on the
378 			 * inactivation list and could not open it.  Accept the
379 			 * alleged ftype and install a new mode below.
380 			 */
381 			error = 0;
382 		} else if (!(sc->flags & XCHK_TRY_HARDER)) {
383 			/*
384 			 * Otherwise, retry the operation one time to see if
385 			 * the reason for the delay is an inode from the same
386 			 * cluster buffer waiting on the inactivation list.
387 			 */
388 			error = -EDEADLOCK;
389 		}
390 	}
391 	if (error)
392 		return error;
393 
394 	/*
395 	 * Convert the discovered ftype into the file mode.  If all else fails,
396 	 * return S_IFREG.
397 	 */
398 	switch (ri->alleged_ftype) {
399 	case XFS_DIR3_FT_DIR:
400 		*mode = S_IFDIR;
401 		break;
402 	case XFS_DIR3_FT_WHT:
403 	case XFS_DIR3_FT_CHRDEV:
404 		*mode = S_IFCHR;
405 		break;
406 	case XFS_DIR3_FT_BLKDEV:
407 		*mode = S_IFBLK;
408 		break;
409 	case XFS_DIR3_FT_FIFO:
410 		*mode = S_IFIFO;
411 		break;
412 	case XFS_DIR3_FT_SOCK:
413 		*mode = S_IFSOCK;
414 		break;
415 	case XFS_DIR3_FT_SYMLINK:
416 		*mode = S_IFLNK;
417 		break;
418 	default:
419 		*mode = S_IFREG;
420 		break;
421 	}
422 	return 0;
423 }
424 
425 /* Turn di_mode into /something/ recognizable.  Returns true if we succeed. */
426 STATIC int
427 xrep_dinode_mode(
428 	struct xrep_inode	*ri,
429 	struct xfs_dinode	*dip)
430 {
431 	struct xfs_scrub	*sc = ri->sc;
432 	uint16_t		mode = be16_to_cpu(dip->di_mode);
433 	int			error;
434 
435 	trace_xrep_dinode_mode(sc, dip);
436 
437 	if (mode == 0 || xfs_mode_to_ftype(mode) != XFS_DIR3_FT_UNKNOWN)
438 		return 0;
439 
440 	/* Try to fix the mode.  If we cannot, then leave everything alone. */
441 	error = xrep_dinode_find_mode(ri, &mode);
442 	switch (error) {
443 	case -EINTR:
444 	case -EBUSY:
445 	case -EDEADLOCK:
446 		/* temporary failure or fatal signal */
447 		return error;
448 	case 0:
449 		/* found mode */
450 		break;
451 	default:
452 		/* some other error, assume S_IFREG */
453 		mode = S_IFREG;
454 		break;
455 	}
456 
457 	/* bad mode, so we set it to a file that only root can read */
458 	dip->di_mode = cpu_to_be16(mode);
459 	dip->di_uid = 0;
460 	dip->di_gid = 0;
461 	ri->zap_acls = true;
462 	return 0;
463 }
464 
465 /* Fix any conflicting flags that the verifiers complain about. */
466 STATIC void
467 xrep_dinode_flags(
468 	struct xfs_scrub	*sc,
469 	struct xfs_dinode	*dip,
470 	bool			isrt)
471 {
472 	struct xfs_mount	*mp = sc->mp;
473 	uint64_t		flags2 = be64_to_cpu(dip->di_flags2);
474 	uint16_t		flags = be16_to_cpu(dip->di_flags);
475 	uint16_t		mode = be16_to_cpu(dip->di_mode);
476 
477 	trace_xrep_dinode_flags(sc, dip);
478 
479 	if (isrt)
480 		flags |= XFS_DIFLAG_REALTIME;
481 	else
482 		flags &= ~XFS_DIFLAG_REALTIME;
483 
484 	/*
485 	 * For regular files on a reflink filesystem, set the REFLINK flag to
486 	 * protect shared extents.  A later stage will actually check those
487 	 * extents and clear the flag if possible.
488 	 */
489 	if (xfs_has_reflink(mp) && S_ISREG(mode))
490 		flags2 |= XFS_DIFLAG2_REFLINK;
491 	else
492 		flags2 &= ~(XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE);
493 	if (flags & XFS_DIFLAG_REALTIME)
494 		flags2 &= ~XFS_DIFLAG2_REFLINK;
495 	if (!xfs_has_bigtime(mp))
496 		flags2 &= ~XFS_DIFLAG2_BIGTIME;
497 	if (!xfs_has_large_extent_counts(mp))
498 		flags2 &= ~XFS_DIFLAG2_NREXT64;
499 	if (flags2 & XFS_DIFLAG2_NREXT64)
500 		dip->di_nrext64_pad = 0;
501 	else if (dip->di_version >= 3)
502 		dip->di_v3_pad = 0;
503 	dip->di_flags = cpu_to_be16(flags);
504 	dip->di_flags2 = cpu_to_be64(flags2);
505 }
506 
507 /*
508  * Blow out symlink; now it points nowhere.  We don't have to worry about
509  * incore state because this inode is failing the verifiers.
510  */
511 STATIC void
512 xrep_dinode_zap_symlink(
513 	struct xrep_inode	*ri,
514 	struct xfs_dinode	*dip)
515 {
516 	struct xfs_scrub	*sc = ri->sc;
517 	char			*p;
518 
519 	trace_xrep_dinode_zap_symlink(sc, dip);
520 
521 	dip->di_format = XFS_DINODE_FMT_LOCAL;
522 	dip->di_size = cpu_to_be64(1);
523 	p = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
524 	*p = '?';
525 	ri->ino_sick_mask |= XFS_SICK_INO_SYMLINK_ZAPPED;
526 }
527 
528 /*
529  * Blow out dir, make the parent point to the root.  In the future repair will
530  * reconstruct this directory for us.  Note that there's no in-core directory
531  * inode because the sf verifier tripped, so we don't have to worry about the
532  * dentry cache.
533  */
534 STATIC void
535 xrep_dinode_zap_dir(
536 	struct xrep_inode	*ri,
537 	struct xfs_dinode	*dip)
538 {
539 	struct xfs_scrub	*sc = ri->sc;
540 	struct xfs_mount	*mp = sc->mp;
541 	struct xfs_dir2_sf_hdr	*sfp;
542 	int			i8count;
543 
544 	trace_xrep_dinode_zap_dir(sc, dip);
545 
546 	dip->di_format = XFS_DINODE_FMT_LOCAL;
547 	i8count = mp->m_sb.sb_rootino > XFS_DIR2_MAX_SHORT_INUM;
548 	sfp = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
549 	sfp->count = 0;
550 	sfp->i8count = i8count;
551 	xfs_dir2_sf_put_parent_ino(sfp, mp->m_sb.sb_rootino);
552 	dip->di_size = cpu_to_be64(xfs_dir2_sf_hdr_size(i8count));
553 	ri->ino_sick_mask |= XFS_SICK_INO_DIR_ZAPPED;
554 }
555 
556 /* Make sure we don't have a garbage file size. */
557 STATIC void
558 xrep_dinode_size(
559 	struct xrep_inode	*ri,
560 	struct xfs_dinode	*dip)
561 {
562 	struct xfs_scrub	*sc = ri->sc;
563 	uint64_t		size = be64_to_cpu(dip->di_size);
564 	uint16_t		mode = be16_to_cpu(dip->di_mode);
565 
566 	trace_xrep_dinode_size(sc, dip);
567 
568 	switch (mode & S_IFMT) {
569 	case S_IFIFO:
570 	case S_IFCHR:
571 	case S_IFBLK:
572 	case S_IFSOCK:
573 		/* di_size can't be nonzero for special files */
574 		dip->di_size = 0;
575 		break;
576 	case S_IFREG:
577 		/* Regular files can't be larger than 2^63-1 bytes. */
578 		dip->di_size = cpu_to_be64(size & ~(1ULL << 63));
579 		break;
580 	case S_IFLNK:
581 		/*
582 		 * Truncate ridiculously oversized symlinks.  If the size is
583 		 * zero, reset it to point to the current directory.  Both of
584 		 * these conditions trigger dinode verifier errors, so there
585 		 * is no in-core state to reset.
586 		 */
587 		if (size > XFS_SYMLINK_MAXLEN)
588 			dip->di_size = cpu_to_be64(XFS_SYMLINK_MAXLEN);
589 		else if (size == 0)
590 			xrep_dinode_zap_symlink(ri, dip);
591 		break;
592 	case S_IFDIR:
593 		/*
594 		 * Directories can't have a size larger than 32G.  If the size
595 		 * is zero, reset it to an empty directory.  Both of these
596 		 * conditions trigger dinode verifier errors, so there is no
597 		 * in-core state to reset.
598 		 */
599 		if (size > XFS_DIR2_SPACE_SIZE)
600 			dip->di_size = cpu_to_be64(XFS_DIR2_SPACE_SIZE);
601 		else if (size == 0)
602 			xrep_dinode_zap_dir(ri, dip);
603 		break;
604 	}
605 }
606 
607 /* Fix extent size hints. */
608 STATIC void
609 xrep_dinode_extsize_hints(
610 	struct xfs_scrub	*sc,
611 	struct xfs_dinode	*dip)
612 {
613 	struct xfs_mount	*mp = sc->mp;
614 	uint64_t		flags2 = be64_to_cpu(dip->di_flags2);
615 	uint16_t		flags = be16_to_cpu(dip->di_flags);
616 	uint16_t		mode = be16_to_cpu(dip->di_mode);
617 
618 	xfs_failaddr_t		fa;
619 
620 	trace_xrep_dinode_extsize_hints(sc, dip);
621 
622 	fa = xfs_inode_validate_extsize(mp, be32_to_cpu(dip->di_extsize),
623 			mode, flags);
624 	if (fa) {
625 		dip->di_extsize = 0;
626 		dip->di_flags &= ~cpu_to_be16(XFS_DIFLAG_EXTSIZE |
627 					      XFS_DIFLAG_EXTSZINHERIT);
628 	}
629 
630 	if (dip->di_version < 3)
631 		return;
632 
633 	fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize),
634 			mode, flags, flags2);
635 	if (fa) {
636 		dip->di_cowextsize = 0;
637 		dip->di_flags2 &= ~cpu_to_be64(XFS_DIFLAG2_COWEXTSIZE);
638 	}
639 }
640 
641 /* Count extents and blocks for an inode given an rmap. */
642 STATIC int
643 xrep_dinode_walk_rmap(
644 	struct xfs_btree_cur		*cur,
645 	const struct xfs_rmap_irec	*rec,
646 	void				*priv)
647 {
648 	struct xrep_inode		*ri = priv;
649 	int				error = 0;
650 
651 	if (xchk_should_terminate(ri->sc, &error))
652 		return error;
653 
654 	/* We only care about this inode. */
655 	if (rec->rm_owner != ri->sc->sm->sm_ino)
656 		return 0;
657 
658 	if (rec->rm_flags & XFS_RMAP_ATTR_FORK) {
659 		ri->attr_blocks += rec->rm_blockcount;
660 		if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
661 			ri->attr_extents++;
662 
663 		return 0;
664 	}
665 
666 	ri->data_blocks += rec->rm_blockcount;
667 	if (!(rec->rm_flags & XFS_RMAP_BMBT_BLOCK))
668 		ri->data_extents++;
669 
670 	return 0;
671 }
672 
673 /* Count extents and blocks for an inode from all AG rmap data. */
674 STATIC int
675 xrep_dinode_count_ag_rmaps(
676 	struct xrep_inode	*ri,
677 	struct xfs_perag	*pag)
678 {
679 	struct xfs_btree_cur	*cur;
680 	struct xfs_buf		*agf;
681 	int			error;
682 
683 	error = xfs_alloc_read_agf(pag, ri->sc->tp, 0, &agf);
684 	if (error)
685 		return error;
686 
687 	cur = xfs_rmapbt_init_cursor(ri->sc->mp, ri->sc->tp, agf, pag);
688 	error = xfs_rmap_query_all(cur, xrep_dinode_walk_rmap, ri);
689 	xfs_btree_del_cursor(cur, error);
690 	xfs_trans_brelse(ri->sc->tp, agf);
691 	return error;
692 }
693 
694 /* Count extents and blocks for a given inode from all rmap data. */
695 STATIC int
696 xrep_dinode_count_rmaps(
697 	struct xrep_inode	*ri)
698 {
699 	struct xfs_perag	*pag;
700 	xfs_agnumber_t		agno;
701 	int			error;
702 
703 	if (!xfs_has_rmapbt(ri->sc->mp) || xfs_has_realtime(ri->sc->mp))
704 		return -EOPNOTSUPP;
705 
706 	for_each_perag(ri->sc->mp, agno, pag) {
707 		error = xrep_dinode_count_ag_rmaps(ri, pag);
708 		if (error) {
709 			xfs_perag_rele(pag);
710 			return error;
711 		}
712 	}
713 
714 	/* Can't have extents on both the rt and the data device. */
715 	if (ri->data_extents && ri->rt_extents)
716 		return -EFSCORRUPTED;
717 
718 	trace_xrep_dinode_count_rmaps(ri->sc,
719 			ri->data_blocks, ri->rt_blocks, ri->attr_blocks,
720 			ri->data_extents, ri->rt_extents, ri->attr_extents);
721 	return 0;
722 }
723 
724 /* Return true if this extents-format ifork looks like garbage. */
725 STATIC bool
726 xrep_dinode_bad_extents_fork(
727 	struct xfs_scrub	*sc,
728 	struct xfs_dinode	*dip,
729 	unsigned int		dfork_size,
730 	int			whichfork)
731 {
732 	struct xfs_bmbt_irec	new;
733 	struct xfs_bmbt_rec	*dp;
734 	xfs_extnum_t		nex;
735 	bool			isrt;
736 	unsigned int		i;
737 
738 	nex = xfs_dfork_nextents(dip, whichfork);
739 	if (nex > dfork_size / sizeof(struct xfs_bmbt_rec))
740 		return true;
741 
742 	dp = XFS_DFORK_PTR(dip, whichfork);
743 
744 	isrt = dip->di_flags & cpu_to_be16(XFS_DIFLAG_REALTIME);
745 	for (i = 0; i < nex; i++, dp++) {
746 		xfs_failaddr_t	fa;
747 
748 		xfs_bmbt_disk_get_all(dp, &new);
749 		fa = xfs_bmap_validate_extent_raw(sc->mp, isrt, whichfork,
750 				&new);
751 		if (fa)
752 			return true;
753 	}
754 
755 	return false;
756 }
757 
758 /* Return true if this btree-format ifork looks like garbage. */
759 STATIC bool
760 xrep_dinode_bad_bmbt_fork(
761 	struct xfs_scrub	*sc,
762 	struct xfs_dinode	*dip,
763 	unsigned int		dfork_size,
764 	int			whichfork)
765 {
766 	struct xfs_bmdr_block	*dfp;
767 	xfs_extnum_t		nex;
768 	unsigned int		i;
769 	unsigned int		dmxr;
770 	unsigned int		nrecs;
771 	unsigned int		level;
772 
773 	nex = xfs_dfork_nextents(dip, whichfork);
774 	if (nex <= dfork_size / sizeof(struct xfs_bmbt_rec))
775 		return true;
776 
777 	if (dfork_size < sizeof(struct xfs_bmdr_block))
778 		return true;
779 
780 	dfp = XFS_DFORK_PTR(dip, whichfork);
781 	nrecs = be16_to_cpu(dfp->bb_numrecs);
782 	level = be16_to_cpu(dfp->bb_level);
783 
784 	if (nrecs == 0 || XFS_BMDR_SPACE_CALC(nrecs) > dfork_size)
785 		return true;
786 	if (level == 0 || level >= XFS_BM_MAXLEVELS(sc->mp, whichfork))
787 		return true;
788 
789 	dmxr = xfs_bmdr_maxrecs(dfork_size, 0);
790 	for (i = 1; i <= nrecs; i++) {
791 		struct xfs_bmbt_key	*fkp;
792 		xfs_bmbt_ptr_t		*fpp;
793 		xfs_fileoff_t		fileoff;
794 		xfs_fsblock_t		fsbno;
795 
796 		fkp = XFS_BMDR_KEY_ADDR(dfp, i);
797 		fileoff = be64_to_cpu(fkp->br_startoff);
798 		if (!xfs_verify_fileoff(sc->mp, fileoff))
799 			return true;
800 
801 		fpp = XFS_BMDR_PTR_ADDR(dfp, i, dmxr);
802 		fsbno = be64_to_cpu(*fpp);
803 		if (!xfs_verify_fsbno(sc->mp, fsbno))
804 			return true;
805 	}
806 
807 	return false;
808 }
809 
810 /*
811  * Check the data fork for things that will fail the ifork verifiers or the
812  * ifork formatters.
813  */
814 STATIC bool
815 xrep_dinode_check_dfork(
816 	struct xfs_scrub	*sc,
817 	struct xfs_dinode	*dip,
818 	uint16_t		mode)
819 {
820 	void			*dfork_ptr;
821 	int64_t			data_size;
822 	unsigned int		fmt;
823 	unsigned int		dfork_size;
824 
825 	/*
826 	 * Verifier functions take signed int64_t, so check for bogus negative
827 	 * values first.
828 	 */
829 	data_size = be64_to_cpu(dip->di_size);
830 	if (data_size < 0)
831 		return true;
832 
833 	fmt = XFS_DFORK_FORMAT(dip, XFS_DATA_FORK);
834 	switch (mode & S_IFMT) {
835 	case S_IFIFO:
836 	case S_IFCHR:
837 	case S_IFBLK:
838 	case S_IFSOCK:
839 		if (fmt != XFS_DINODE_FMT_DEV)
840 			return true;
841 		break;
842 	case S_IFREG:
843 		if (fmt == XFS_DINODE_FMT_LOCAL)
844 			return true;
845 		fallthrough;
846 	case S_IFLNK:
847 	case S_IFDIR:
848 		switch (fmt) {
849 		case XFS_DINODE_FMT_LOCAL:
850 		case XFS_DINODE_FMT_EXTENTS:
851 		case XFS_DINODE_FMT_BTREE:
852 			break;
853 		default:
854 			return true;
855 		}
856 		break;
857 	default:
858 		return true;
859 	}
860 
861 	dfork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_DATA_FORK);
862 	dfork_ptr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
863 
864 	switch (fmt) {
865 	case XFS_DINODE_FMT_DEV:
866 		break;
867 	case XFS_DINODE_FMT_LOCAL:
868 		/* dir/symlink structure cannot be larger than the fork */
869 		if (data_size > dfork_size)
870 			return true;
871 		/* directory structure must pass verification. */
872 		if (S_ISDIR(mode) &&
873 		    xfs_dir2_sf_verify(sc->mp, dfork_ptr, data_size) != NULL)
874 			return true;
875 		/* symlink structure must pass verification. */
876 		if (S_ISLNK(mode) &&
877 		    xfs_symlink_shortform_verify(dfork_ptr, data_size) != NULL)
878 			return true;
879 		break;
880 	case XFS_DINODE_FMT_EXTENTS:
881 		if (xrep_dinode_bad_extents_fork(sc, dip, dfork_size,
882 				XFS_DATA_FORK))
883 			return true;
884 		break;
885 	case XFS_DINODE_FMT_BTREE:
886 		if (xrep_dinode_bad_bmbt_fork(sc, dip, dfork_size,
887 				XFS_DATA_FORK))
888 			return true;
889 		break;
890 	default:
891 		return true;
892 	}
893 
894 	return false;
895 }
896 
897 static void
898 xrep_dinode_set_data_nextents(
899 	struct xfs_dinode	*dip,
900 	xfs_extnum_t		nextents)
901 {
902 	if (xfs_dinode_has_large_extent_counts(dip))
903 		dip->di_big_nextents = cpu_to_be64(nextents);
904 	else
905 		dip->di_nextents = cpu_to_be32(nextents);
906 }
907 
908 static void
909 xrep_dinode_set_attr_nextents(
910 	struct xfs_dinode	*dip,
911 	xfs_extnum_t		nextents)
912 {
913 	if (xfs_dinode_has_large_extent_counts(dip))
914 		dip->di_big_anextents = cpu_to_be32(nextents);
915 	else
916 		dip->di_anextents = cpu_to_be16(nextents);
917 }
918 
919 /* Reset the data fork to something sane. */
920 STATIC void
921 xrep_dinode_zap_dfork(
922 	struct xrep_inode	*ri,
923 	struct xfs_dinode	*dip,
924 	uint16_t		mode)
925 {
926 	struct xfs_scrub	*sc = ri->sc;
927 
928 	trace_xrep_dinode_zap_dfork(sc, dip);
929 
930 	ri->ino_sick_mask |= XFS_SICK_INO_BMBTD_ZAPPED;
931 
932 	xrep_dinode_set_data_nextents(dip, 0);
933 	ri->data_blocks = 0;
934 	ri->rt_blocks = 0;
935 
936 	/* Special files always get reset to DEV */
937 	switch (mode & S_IFMT) {
938 	case S_IFIFO:
939 	case S_IFCHR:
940 	case S_IFBLK:
941 	case S_IFSOCK:
942 		dip->di_format = XFS_DINODE_FMT_DEV;
943 		dip->di_size = 0;
944 		return;
945 	}
946 
947 	/*
948 	 * If we have data extents, reset to an empty map and hope the user
949 	 * will run the bmapbtd checker next.
950 	 */
951 	if (ri->data_extents || ri->rt_extents || S_ISREG(mode)) {
952 		dip->di_format = XFS_DINODE_FMT_EXTENTS;
953 		return;
954 	}
955 
956 	/* Otherwise, reset the local format to the minimum. */
957 	switch (mode & S_IFMT) {
958 	case S_IFLNK:
959 		xrep_dinode_zap_symlink(ri, dip);
960 		break;
961 	case S_IFDIR:
962 		xrep_dinode_zap_dir(ri, dip);
963 		break;
964 	}
965 }
966 
967 /*
968  * Check the attr fork for things that will fail the ifork verifiers or the
969  * ifork formatters.
970  */
971 STATIC bool
972 xrep_dinode_check_afork(
973 	struct xfs_scrub		*sc,
974 	struct xfs_dinode		*dip)
975 {
976 	struct xfs_attr_sf_hdr		*afork_ptr;
977 	size_t				attr_size;
978 	unsigned int			afork_size;
979 
980 	if (XFS_DFORK_BOFF(dip) == 0)
981 		return dip->di_aformat != XFS_DINODE_FMT_EXTENTS ||
982 		       xfs_dfork_attr_extents(dip) != 0;
983 
984 	afork_size = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
985 	afork_ptr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
986 
987 	switch (XFS_DFORK_FORMAT(dip, XFS_ATTR_FORK)) {
988 	case XFS_DINODE_FMT_LOCAL:
989 		/* Fork has to be large enough to extract the xattr size. */
990 		if (afork_size < sizeof(struct xfs_attr_sf_hdr))
991 			return true;
992 
993 		/* xattr structure cannot be larger than the fork */
994 		attr_size = be16_to_cpu(afork_ptr->totsize);
995 		if (attr_size > afork_size)
996 			return true;
997 
998 		/* xattr structure must pass verification. */
999 		return xfs_attr_shortform_verify(afork_ptr, attr_size) != NULL;
1000 	case XFS_DINODE_FMT_EXTENTS:
1001 		if (xrep_dinode_bad_extents_fork(sc, dip, afork_size,
1002 					XFS_ATTR_FORK))
1003 			return true;
1004 		break;
1005 	case XFS_DINODE_FMT_BTREE:
1006 		if (xrep_dinode_bad_bmbt_fork(sc, dip, afork_size,
1007 					XFS_ATTR_FORK))
1008 			return true;
1009 		break;
1010 	default:
1011 		return true;
1012 	}
1013 
1014 	return false;
1015 }
1016 
1017 /*
1018  * Reset the attr fork to empty.  Since the attr fork could have contained
1019  * ACLs, make the file readable only by root.
1020  */
1021 STATIC void
1022 xrep_dinode_zap_afork(
1023 	struct xrep_inode	*ri,
1024 	struct xfs_dinode	*dip,
1025 	uint16_t		mode)
1026 {
1027 	struct xfs_scrub	*sc = ri->sc;
1028 
1029 	trace_xrep_dinode_zap_afork(sc, dip);
1030 
1031 	ri->ino_sick_mask |= XFS_SICK_INO_BMBTA_ZAPPED;
1032 
1033 	dip->di_aformat = XFS_DINODE_FMT_EXTENTS;
1034 	xrep_dinode_set_attr_nextents(dip, 0);
1035 	ri->attr_blocks = 0;
1036 
1037 	/*
1038 	 * If the data fork is in btree format, removing the attr fork entirely
1039 	 * might cause verifier failures if the next level down in the bmbt
1040 	 * could now fit in the data fork area.
1041 	 */
1042 	if (dip->di_format != XFS_DINODE_FMT_BTREE)
1043 		dip->di_forkoff = 0;
1044 	dip->di_mode = cpu_to_be16(mode & ~0777);
1045 	dip->di_uid = 0;
1046 	dip->di_gid = 0;
1047 }
1048 
/*
 * Make sure the fork offset is a sensible value.
 *
 * Work out the minimum number of literal area bytes that the attr fork and
 * the data fork of @dip each need, based on their ondisk formats and on the
 * extent counts recorded in @ri.  If a fork was reset to zero extents but @ri
 * says that extents were found for it, make sure that fork has at least
 * enough room for a minimal bmap btree root, either by moving di_forkoff or,
 * when both forks cannot fit, by zapping the attr fork.  @mode is only passed
 * through to xrep_dinode_zap_afork.
 */
STATIC void
xrep_dinode_ensure_forkoff(
	struct xrep_inode	*ri,
	struct xfs_dinode	*dip,
	uint16_t		mode)
{
	struct xfs_bmdr_block	*bmdr;
	struct xfs_scrub	*sc = ri->sc;
	xfs_extnum_t		attr_extents, data_extents;
	size_t			bmdr_minsz = XFS_BMDR_SPACE_CALC(1);
	unsigned int		lit_sz = XFS_LITINO(sc->mp);
	unsigned int		afork_min, dfork_min;

	trace_xrep_dinode_ensure_forkoff(sc, dip);

	/*
	 * Before calling this function, xrep_dinode_core ensured that both
	 * forks actually fit inside their respective literal areas.  If this
	 * was not the case, the fork was reset to FMT_EXTENTS with zero
	 * records.  If the rmapbt scan found attr or data fork blocks, this
	 * will be noted in the dinode_stats, and we must leave enough room
	 * for the bmap repair code to reconstruct the mapping structure.
	 *
	 * First, compute the minimum space required for the attr fork.
	 */
	switch (dip->di_aformat) {
	case XFS_DINODE_FMT_LOCAL:
		/*
		 * If we still have a shortform xattr structure at all, that
		 * means the attr fork area was exactly large enough to fit
		 * the sf structure.
		 */
		afork_min = XFS_DFORK_SIZE(dip, sc->mp, XFS_ATTR_FORK);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		attr_extents = xfs_dfork_attr_extents(dip);
		if (attr_extents) {
			/*
			 * We must maintain sufficient space to hold the entire
			 * extent map array in the attr fork.  Note that we
			 * previously zapped the fork if it had no chance of
			 * fitting in the inode.
			 */
			afork_min = sizeof(struct xfs_bmbt_rec) * attr_extents;
		} else if (ri->attr_extents > 0) {
			/*
			 * The attr fork thinks it has zero extents, but we
			 * found some xattr extents.  We need to leave enough
			 * empty space here so that the incore attr fork will
			 * get created (and hence trigger the attr fork bmap
			 * repairer).
			 */
			afork_min = bmdr_minsz;
		} else {
			/* No extents on disk or found in rmapbt. */
			afork_min = 0;
		}
		break;
	case XFS_DINODE_FMT_BTREE:
		/* Must have space for btree header and key/pointers. */
		bmdr = XFS_DFORK_PTR(dip, XFS_ATTR_FORK);
		afork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
		break;
	default:
		/* We should never see any other formats. */
		afork_min = 0;
		break;
	}

	/* Compute the minimum space required for the data fork. */
	switch (dip->di_format) {
	case XFS_DINODE_FMT_DEV:
		dfork_min = sizeof(__be32);
		break;
	case XFS_DINODE_FMT_UUID:
		dfork_min = sizeof(uuid_t);
		break;
	case XFS_DINODE_FMT_LOCAL:
		/*
		 * If we still have a shortform data fork at all, that means
		 * the data fork area was large enough to fit whatever was in
		 * there.
		 */
		dfork_min = be64_to_cpu(dip->di_size);
		break;
	case XFS_DINODE_FMT_EXTENTS:
		data_extents = xfs_dfork_data_extents(dip);
		if (data_extents) {
			/*
			 * We must maintain sufficient space to hold the entire
			 * extent map array in the data fork.  Note that we
			 * previously zapped the fork if it had no chance of
			 * fitting in the inode.
			 */
			dfork_min = sizeof(struct xfs_bmbt_rec) * data_extents;
		} else if (ri->data_extents > 0 || ri->rt_extents > 0) {
			/*
			 * The data fork thinks it has zero extents, but we
			 * found some data extents.  We need to leave enough
			 * empty space here so that the data fork bmap repair
			 * will recover the mappings.
			 */
			dfork_min = bmdr_minsz;
		} else {
			/* No extents on disk or found in rmapbt. */
			dfork_min = 0;
		}
		break;
	case XFS_DINODE_FMT_BTREE:
		/* Must have space for btree header and key/pointers. */
		bmdr = XFS_DFORK_PTR(dip, XFS_DATA_FORK);
		dfork_min = XFS_BMAP_BROOT_SPACE(sc->mp, bmdr);
		break;
	default:
		/* Any other data fork format needs no reserved space here. */
		dfork_min = 0;
		break;
	}

	/*
	 * Round all values up to the nearest 8 bytes, because that is the
	 * precision of di_forkoff.
	 */
	afork_min = roundup(afork_min, 8);
	dfork_min = roundup(dfork_min, 8);
	bmdr_minsz = roundup(bmdr_minsz, 8);

	/* Each fork's minimum must fit in the literal area by itself. */
	ASSERT(dfork_min <= lit_sz);
	ASSERT(afork_min <= lit_sz);

	/*
	 * If the data fork was zapped and we don't have enough space for the
	 * recovery fork, move the attr fork up.
	 */
	if (dip->di_format == XFS_DINODE_FMT_EXTENTS &&
	    xfs_dfork_data_extents(dip) == 0 &&
	    (ri->data_extents > 0 || ri->rt_extents > 0) &&
	    bmdr_minsz > XFS_DFORK_DSIZE(dip, sc->mp)) {
		if (bmdr_minsz + afork_min > lit_sz) {
			/*
			 * The attr fork and the stub fork we need to recover
			 * the data fork won't both fit.  Zap the attr fork.
			 */
			xrep_dinode_zap_afork(ri, dip, mode);
			/*
			 * NOTE(review): afork_min is not read again below, so
			 * this assignment currently has no effect.
			 */
			afork_min = bmdr_minsz;
		} else {
			void	*before, *after;

			/*
			 * Otherwise, just slide the attr fork up.  di_forkoff
			 * counts 8-byte units, hence the shift.  The old and
			 * new attr fork areas can overlap, so use memmove.
			 */
			before = XFS_DFORK_APTR(dip);
			dip->di_forkoff = bmdr_minsz >> 3;
			after = XFS_DFORK_APTR(dip);
			memmove(after, before, XFS_DFORK_ASIZE(dip, sc->mp));
		}
	}

	/*
	 * If the attr fork was zapped and we don't have enough space for the
	 * recovery fork, move the attr fork down.
	 */
	if (dip->di_aformat == XFS_DINODE_FMT_EXTENTS &&
	    xfs_dfork_attr_extents(dip) == 0 &&
	    ri->attr_extents > 0 &&
	    bmdr_minsz > XFS_DFORK_ASIZE(dip, sc->mp)) {
		if (dip->di_format == XFS_DINODE_FMT_BTREE) {
			/*
			 * If the data fork is in btree format then we can't
			 * adjust forkoff because that runs the risk of
			 * violating the extents/btree format transition rules.
			 */
		} else if (bmdr_minsz + dfork_min > lit_sz) {
			/*
			 * If we can't move the attr fork, too bad, we lose the
			 * attr fork and leak its blocks.
			 */
			xrep_dinode_zap_afork(ri, dip, mode);
		} else {
			/*
			 * Otherwise, just slide the attr fork down.  The attr
			 * fork is empty, so we don't have any old contents to
			 * move here.  The new offset leaves bmdr_minsz bytes
			 * between the fork offset and the end of the literal
			 * area.
			 */
			dip->di_forkoff = (lit_sz - bmdr_minsz) >> 3;
		}
	}
}
1235 
1236 /*
1237  * Zap the data/attr forks if we spot anything that isn't going to pass the
1238  * ifork verifiers or the ifork formatters, because we need to get the inode
1239  * into good enough shape that the higher level repair functions can run.
1240  */
1241 STATIC void
1242 xrep_dinode_zap_forks(
1243 	struct xrep_inode	*ri,
1244 	struct xfs_dinode	*dip)
1245 {
1246 	struct xfs_scrub	*sc = ri->sc;
1247 	xfs_extnum_t		data_extents;
1248 	xfs_extnum_t		attr_extents;
1249 	xfs_filblks_t		nblocks;
1250 	uint16_t		mode;
1251 	bool			zap_datafork = false;
1252 	bool			zap_attrfork = ri->zap_acls;
1253 
1254 	trace_xrep_dinode_zap_forks(sc, dip);
1255 
1256 	mode = be16_to_cpu(dip->di_mode);
1257 
1258 	data_extents = xfs_dfork_data_extents(dip);
1259 	attr_extents = xfs_dfork_attr_extents(dip);
1260 	nblocks = be64_to_cpu(dip->di_nblocks);
1261 
1262 	/* Inode counters don't make sense? */
1263 	if (data_extents > nblocks)
1264 		zap_datafork = true;
1265 	if (attr_extents > nblocks)
1266 		zap_attrfork = true;
1267 	if (data_extents + attr_extents > nblocks)
1268 		zap_datafork = zap_attrfork = true;
1269 
1270 	if (!zap_datafork)
1271 		zap_datafork = xrep_dinode_check_dfork(sc, dip, mode);
1272 	if (!zap_attrfork)
1273 		zap_attrfork = xrep_dinode_check_afork(sc, dip);
1274 
1275 	/* Zap whatever's bad. */
1276 	if (zap_attrfork)
1277 		xrep_dinode_zap_afork(ri, dip, mode);
1278 	if (zap_datafork)
1279 		xrep_dinode_zap_dfork(ri, dip, mode);
1280 	xrep_dinode_ensure_forkoff(ri, dip, mode);
1281 
1282 	/*
1283 	 * Zero di_nblocks if we don't have any extents at all to satisfy the
1284 	 * buffer verifier.
1285 	 */
1286 	data_extents = xfs_dfork_data_extents(dip);
1287 	attr_extents = xfs_dfork_attr_extents(dip);
1288 	if (data_extents + attr_extents == 0)
1289 		dip->di_nblocks = 0;
1290 }
1291 
1292 /* Inode didn't pass dinode verifiers, so fix the raw buffer and retry iget. */
1293 STATIC int
1294 xrep_dinode_core(
1295 	struct xrep_inode	*ri)
1296 {
1297 	struct xfs_scrub	*sc = ri->sc;
1298 	struct xfs_buf		*bp;
1299 	struct xfs_dinode	*dip;
1300 	xfs_ino_t		ino = sc->sm->sm_ino;
1301 	int			error;
1302 	int			iget_error;
1303 
1304 	/* Figure out what this inode had mapped in both forks. */
1305 	error = xrep_dinode_count_rmaps(ri);
1306 	if (error)
1307 		return error;
1308 
1309 	/* Read the inode cluster buffer. */
1310 	error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp,
1311 			ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp,
1312 			NULL);
1313 	if (error)
1314 		return error;
1315 
1316 	/* Make sure we can pass the inode buffer verifier. */
1317 	xrep_dinode_buf(sc, bp);
1318 	bp->b_ops = &xfs_inode_buf_ops;
1319 
1320 	/* Fix everything the verifier will complain about. */
1321 	dip = xfs_buf_offset(bp, ri->imap.im_boffset);
1322 	xrep_dinode_header(sc, dip);
1323 	iget_error = xrep_dinode_mode(ri, dip);
1324 	if (iget_error)
1325 		goto write;
1326 	xrep_dinode_flags(sc, dip, ri->rt_extents > 0);
1327 	xrep_dinode_size(ri, dip);
1328 	xrep_dinode_extsize_hints(sc, dip);
1329 	xrep_dinode_zap_forks(ri, dip);
1330 
1331 write:
1332 	/* Write out the inode. */
1333 	trace_xrep_dinode_fixed(sc, dip);
1334 	xfs_dinode_calc_crc(sc->mp, dip);
1335 	xfs_trans_buf_set_type(sc->tp, bp, XFS_BLFT_DINO_BUF);
1336 	xfs_trans_log_buf(sc->tp, bp, ri->imap.im_boffset,
1337 			ri->imap.im_boffset + sc->mp->m_sb.sb_inodesize - 1);
1338 
1339 	/*
1340 	 * In theory, we've fixed the ondisk inode record enough that we should
1341 	 * be able to load the inode into the cache.  Try to iget that inode
1342 	 * now while we hold the AGI and the inode cluster buffer and take the
1343 	 * IOLOCK so that we can continue with repairs without anyone else
1344 	 * accessing the inode.  If iget fails, we still need to commit the
1345 	 * changes.
1346 	 */
1347 	if (!iget_error)
1348 		iget_error = xchk_iget(sc, ino, &sc->ip);
1349 	if (!iget_error)
1350 		xchk_ilock(sc, XFS_IOLOCK_EXCL);
1351 
1352 	/*
1353 	 * Commit the inode cluster buffer updates and drop the AGI buffer that
1354 	 * we've been holding since scrub setup.  From here on out, repairs
1355 	 * deal only with the cached inode.
1356 	 */
1357 	error = xrep_trans_commit(sc);
1358 	if (error)
1359 		return error;
1360 
1361 	if (iget_error)
1362 		return iget_error;
1363 
1364 	error = xchk_trans_alloc(sc, 0);
1365 	if (error)
1366 		return error;
1367 
1368 	error = xrep_ino_dqattach(sc);
1369 	if (error)
1370 		return error;
1371 
1372 	xchk_ilock(sc, XFS_ILOCK_EXCL);
1373 	if (ri->ino_sick_mask)
1374 		xfs_inode_mark_sick(sc->ip, ri->ino_sick_mask);
1375 	return 0;
1376 }
1377 
1378 /* Fix everything xfs_dinode_verify cares about. */
1379 STATIC int
1380 xrep_dinode_problems(
1381 	struct xrep_inode	*ri)
1382 {
1383 	struct xfs_scrub	*sc = ri->sc;
1384 	int			error;
1385 
1386 	error = xrep_dinode_core(ri);
1387 	if (error)
1388 		return error;
1389 
1390 	/* We had to fix a totally busted inode, schedule quotacheck. */
1391 	if (XFS_IS_UQUOTA_ON(sc->mp))
1392 		xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1393 	if (XFS_IS_GQUOTA_ON(sc->mp))
1394 		xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1395 	if (XFS_IS_PQUOTA_ON(sc->mp))
1396 		xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1397 
1398 	return 0;
1399 }
1400 
1401 /*
1402  * Fix problems that the verifiers don't care about.  In general these are
1403  * errors that don't cause problems elsewhere in the kernel that we can easily
1404  * detect, so we don't check them all that rigorously.
1405  */
1406 
1407 /* Make sure block and extent counts are ok. */
1408 STATIC int
1409 xrep_inode_blockcounts(
1410 	struct xfs_scrub	*sc)
1411 {
1412 	struct xfs_ifork	*ifp;
1413 	xfs_filblks_t		count;
1414 	xfs_filblks_t		acount;
1415 	xfs_extnum_t		nextents;
1416 	int			error;
1417 
1418 	trace_xrep_inode_blockcounts(sc);
1419 
1420 	/* Set data fork counters from the data fork mappings. */
1421 	error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_DATA_FORK,
1422 			&nextents, &count);
1423 	if (error)
1424 		return error;
1425 	if (xfs_is_reflink_inode(sc->ip)) {
1426 		/*
1427 		 * data fork blockcount can exceed physical storage if a user
1428 		 * reflinks the same block over and over again.
1429 		 */
1430 		;
1431 	} else if (XFS_IS_REALTIME_INODE(sc->ip)) {
1432 		if (count >= sc->mp->m_sb.sb_rblocks)
1433 			return -EFSCORRUPTED;
1434 	} else {
1435 		if (count >= sc->mp->m_sb.sb_dblocks)
1436 			return -EFSCORRUPTED;
1437 	}
1438 	error = xrep_ino_ensure_extent_count(sc, XFS_DATA_FORK, nextents);
1439 	if (error)
1440 		return error;
1441 	sc->ip->i_df.if_nextents = nextents;
1442 
1443 	/* Set attr fork counters from the attr fork mappings. */
1444 	ifp = xfs_ifork_ptr(sc->ip, XFS_ATTR_FORK);
1445 	if (ifp) {
1446 		error = xfs_bmap_count_blocks(sc->tp, sc->ip, XFS_ATTR_FORK,
1447 				&nextents, &acount);
1448 		if (error)
1449 			return error;
1450 		if (count >= sc->mp->m_sb.sb_dblocks)
1451 			return -EFSCORRUPTED;
1452 		error = xrep_ino_ensure_extent_count(sc, XFS_ATTR_FORK,
1453 				nextents);
1454 		if (error)
1455 			return error;
1456 		ifp->if_nextents = nextents;
1457 	} else {
1458 		acount = 0;
1459 	}
1460 
1461 	sc->ip->i_nblocks = count + acount;
1462 	return 0;
1463 }
1464 
1465 /* Check for invalid uid/gid/prid. */
1466 STATIC void
1467 xrep_inode_ids(
1468 	struct xfs_scrub	*sc)
1469 {
1470 	bool			dirty = false;
1471 
1472 	trace_xrep_inode_ids(sc);
1473 
1474 	if (!uid_valid(VFS_I(sc->ip)->i_uid)) {
1475 		i_uid_write(VFS_I(sc->ip), 0);
1476 		dirty = true;
1477 		if (XFS_IS_UQUOTA_ON(sc->mp))
1478 			xrep_force_quotacheck(sc, XFS_DQTYPE_USER);
1479 	}
1480 
1481 	if (!gid_valid(VFS_I(sc->ip)->i_gid)) {
1482 		i_gid_write(VFS_I(sc->ip), 0);
1483 		dirty = true;
1484 		if (XFS_IS_GQUOTA_ON(sc->mp))
1485 			xrep_force_quotacheck(sc, XFS_DQTYPE_GROUP);
1486 	}
1487 
1488 	if (sc->ip->i_projid == -1U) {
1489 		sc->ip->i_projid = 0;
1490 		dirty = true;
1491 		if (XFS_IS_PQUOTA_ON(sc->mp))
1492 			xrep_force_quotacheck(sc, XFS_DQTYPE_PROJ);
1493 	}
1494 
1495 	/* strip setuid/setgid if we touched any of the ids */
1496 	if (dirty)
1497 		VFS_I(sc->ip)->i_mode &= ~(S_ISUID | S_ISGID);
1498 }
1499 
1500 static inline void
1501 xrep_clamp_timestamp(
1502 	struct xfs_inode	*ip,
1503 	struct timespec64	*ts)
1504 {
1505 	ts->tv_nsec = clamp_t(long, ts->tv_nsec, 0, NSEC_PER_SEC);
1506 	*ts = timestamp_truncate(*ts, VFS_I(ip));
1507 }
1508 
1509 /* Nanosecond counters can't have more than 1 billion. */
1510 STATIC void
1511 xrep_inode_timestamps(
1512 	struct xfs_inode	*ip)
1513 {
1514 	struct timespec64	tstamp;
1515 	struct inode		*inode = VFS_I(ip);
1516 
1517 	tstamp = inode_get_atime(inode);
1518 	xrep_clamp_timestamp(ip, &tstamp);
1519 	inode_set_atime_to_ts(inode, tstamp);
1520 
1521 	tstamp = inode_get_mtime(inode);
1522 	xrep_clamp_timestamp(ip, &tstamp);
1523 	inode_set_mtime_to_ts(inode, tstamp);
1524 
1525 	tstamp = inode_get_ctime(inode);
1526 	xrep_clamp_timestamp(ip, &tstamp);
1527 	inode_set_ctime_to_ts(inode, tstamp);
1528 
1529 	xrep_clamp_timestamp(ip, &ip->i_crtime);
1530 }
1531 
1532 /* Fix inode flags that don't make sense together. */
1533 STATIC void
1534 xrep_inode_flags(
1535 	struct xfs_scrub	*sc)
1536 {
1537 	uint16_t		mode;
1538 
1539 	trace_xrep_inode_flags(sc);
1540 
1541 	mode = VFS_I(sc->ip)->i_mode;
1542 
1543 	/* Clear junk flags */
1544 	if (sc->ip->i_diflags & ~XFS_DIFLAG_ANY)
1545 		sc->ip->i_diflags &= ~XFS_DIFLAG_ANY;
1546 
1547 	/* NEWRTBM only applies to realtime bitmaps */
1548 	if (sc->ip->i_ino == sc->mp->m_sb.sb_rbmino)
1549 		sc->ip->i_diflags |= XFS_DIFLAG_NEWRTBM;
1550 	else
1551 		sc->ip->i_diflags &= ~XFS_DIFLAG_NEWRTBM;
1552 
1553 	/* These only make sense for directories. */
1554 	if (!S_ISDIR(mode))
1555 		sc->ip->i_diflags &= ~(XFS_DIFLAG_RTINHERIT |
1556 					  XFS_DIFLAG_EXTSZINHERIT |
1557 					  XFS_DIFLAG_PROJINHERIT |
1558 					  XFS_DIFLAG_NOSYMLINKS);
1559 
1560 	/* These only make sense for files. */
1561 	if (!S_ISREG(mode))
1562 		sc->ip->i_diflags &= ~(XFS_DIFLAG_REALTIME |
1563 					  XFS_DIFLAG_EXTSIZE);
1564 
1565 	/* These only make sense for non-rt files. */
1566 	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1567 		sc->ip->i_diflags &= ~XFS_DIFLAG_FILESTREAM;
1568 
1569 	/* Immutable and append only?  Drop the append. */
1570 	if ((sc->ip->i_diflags & XFS_DIFLAG_IMMUTABLE) &&
1571 	    (sc->ip->i_diflags & XFS_DIFLAG_APPEND))
1572 		sc->ip->i_diflags &= ~XFS_DIFLAG_APPEND;
1573 
1574 	/* Clear junk flags. */
1575 	if (sc->ip->i_diflags2 & ~XFS_DIFLAG2_ANY)
1576 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_ANY;
1577 
1578 	/* No reflink flag unless we support it and it's a file. */
1579 	if (!xfs_has_reflink(sc->mp) || !S_ISREG(mode))
1580 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1581 
1582 	/* DAX only applies to files and dirs. */
1583 	if (!(S_ISREG(mode) || S_ISDIR(mode)))
1584 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_DAX;
1585 
1586 	/* No reflink files on the realtime device. */
1587 	if (sc->ip->i_diflags & XFS_DIFLAG_REALTIME)
1588 		sc->ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK;
1589 }
1590 
1591 /*
1592  * Fix size problems with block/node format directories.  If we fail to find
1593  * the extent list, just bail out and let the bmapbtd repair functions clean
1594  * up that mess.
1595  */
1596 STATIC void
1597 xrep_inode_blockdir_size(
1598 	struct xfs_scrub	*sc)
1599 {
1600 	struct xfs_iext_cursor	icur;
1601 	struct xfs_bmbt_irec	got;
1602 	struct xfs_ifork	*ifp;
1603 	xfs_fileoff_t		off;
1604 	int			error;
1605 
1606 	trace_xrep_inode_blockdir_size(sc);
1607 
1608 	error = xfs_iread_extents(sc->tp, sc->ip, XFS_DATA_FORK);
1609 	if (error)
1610 		return;
1611 
1612 	/* Find the last block before 32G; this is the dir size. */
1613 	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1614 	off = XFS_B_TO_FSB(sc->mp, XFS_DIR2_SPACE_SIZE);
1615 	if (!xfs_iext_lookup_extent_before(sc->ip, ifp, &off, &icur, &got)) {
1616 		/* zero-extents directory? */
1617 		return;
1618 	}
1619 
1620 	off = got.br_startoff + got.br_blockcount;
1621 	sc->ip->i_disk_size = min_t(loff_t, XFS_DIR2_SPACE_SIZE,
1622 			XFS_FSB_TO_B(sc->mp, off));
1623 }
1624 
1625 /* Fix size problems with short format directories. */
1626 STATIC void
1627 xrep_inode_sfdir_size(
1628 	struct xfs_scrub	*sc)
1629 {
1630 	struct xfs_ifork	*ifp;
1631 
1632 	trace_xrep_inode_sfdir_size(sc);
1633 
1634 	ifp = xfs_ifork_ptr(sc->ip, XFS_DATA_FORK);
1635 	sc->ip->i_disk_size = ifp->if_bytes;
1636 }
1637 
1638 /*
1639  * Fix any irregularities in a directory inode's size now that we can iterate
1640  * extent maps and access other regular inode data.
1641  */
1642 STATIC void
1643 xrep_inode_dir_size(
1644 	struct xfs_scrub	*sc)
1645 {
1646 	trace_xrep_inode_dir_size(sc);
1647 
1648 	switch (sc->ip->i_df.if_format) {
1649 	case XFS_DINODE_FMT_EXTENTS:
1650 	case XFS_DINODE_FMT_BTREE:
1651 		xrep_inode_blockdir_size(sc);
1652 		break;
1653 	case XFS_DINODE_FMT_LOCAL:
1654 		xrep_inode_sfdir_size(sc);
1655 		break;
1656 	}
1657 }
1658 
1659 /* Fix extent size hint problems. */
1660 STATIC void
1661 xrep_inode_extsize(
1662 	struct xfs_scrub	*sc)
1663 {
1664 	/* Fix misaligned extent size hints on a directory. */
1665 	if ((sc->ip->i_diflags & XFS_DIFLAG_RTINHERIT) &&
1666 	    (sc->ip->i_diflags & XFS_DIFLAG_EXTSZINHERIT) &&
1667 	    xfs_extlen_to_rtxmod(sc->mp, sc->ip->i_extsize) > 0) {
1668 		sc->ip->i_extsize = 0;
1669 		sc->ip->i_diflags &= ~XFS_DIFLAG_EXTSZINHERIT;
1670 	}
1671 }
1672 
1673 /* Fix any irregularities in an inode that the verifiers don't catch. */
1674 STATIC int
1675 xrep_inode_problems(
1676 	struct xfs_scrub	*sc)
1677 {
1678 	int			error;
1679 
1680 	error = xrep_inode_blockcounts(sc);
1681 	if (error)
1682 		return error;
1683 	xrep_inode_timestamps(sc->ip);
1684 	xrep_inode_flags(sc);
1685 	xrep_inode_ids(sc);
1686 	/*
1687 	 * We can now do a better job fixing the size of a directory now that
1688 	 * we can scan the data fork extents than we could in xrep_dinode_size.
1689 	 */
1690 	if (S_ISDIR(VFS_I(sc->ip)->i_mode))
1691 		xrep_inode_dir_size(sc);
1692 	xrep_inode_extsize(sc);
1693 
1694 	trace_xrep_inode_fixed(sc);
1695 	xfs_trans_log_inode(sc->tp, sc->ip, XFS_ILOG_CORE);
1696 	return xrep_roll_trans(sc);
1697 }
1698 
1699 /* Repair an inode's fields. */
1700 int
1701 xrep_inode(
1702 	struct xfs_scrub	*sc)
1703 {
1704 	int			error = 0;
1705 
1706 	/*
1707 	 * No inode?  That means we failed the _iget verifiers.  Repair all
1708 	 * the things that the inode verifiers care about, then retry _iget.
1709 	 */
1710 	if (!sc->ip) {
1711 		struct xrep_inode	*ri = sc->buf;
1712 
1713 		ASSERT(ri != NULL);
1714 
1715 		error = xrep_dinode_problems(ri);
1716 		if (error == -EBUSY) {
1717 			/*
1718 			 * Directory scan to recover inode mode encountered a
1719 			 * busy inode, so we did not continue repairing things.
1720 			 */
1721 			return 0;
1722 		}
1723 		if (error)
1724 			return error;
1725 
1726 		/* By this point we had better have a working incore inode. */
1727 		if (!sc->ip)
1728 			return -EFSCORRUPTED;
1729 	}
1730 
1731 	xfs_trans_ijoin(sc->tp, sc->ip, 0);
1732 
1733 	/* If we found corruption of any kind, try to fix it. */
1734 	if ((sc->sm->sm_flags & XFS_SCRUB_OFLAG_CORRUPT) ||
1735 	    (sc->sm->sm_flags & XFS_SCRUB_OFLAG_XCORRUPT)) {
1736 		error = xrep_inode_problems(sc);
1737 		if (error)
1738 			return error;
1739 	}
1740 
1741 	/* See if we can clear the reflink flag. */
1742 	if (xfs_is_reflink_inode(sc->ip)) {
1743 		error = xfs_reflink_clear_inode_flag(sc->ip, &sc->tp);
1744 		if (error)
1745 			return error;
1746 	}
1747 
1748 	return xrep_defer_finish(sc);
1749 }
1750